author     Aapo Kyrola <akyrola@fb.com>    2017-04-17 21:23:45 -0700
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>    2017-04-17 21:31:20 -0700
commit     9ab077dc9d0bbe651348a498dd5472dc4d51f0af (patch)
tree       53060eb0ae8e9e03d765cf80d63e1f37e08ef1c8 /caffe2/utils
parent     391fd141150d250ee60027fad4acf8aecefbae51 (diff)
Revert D4871248: [caffe2][PR] fp16 support for FullyConnected op
Summary: This reverts commit 6a991c2c993dcf0b1e18aa3f2ffbe19e693dbadd

Differential Revision: D4871248

fbshipit-source-id: b6d812d09a00c83e363432e84742c503abfed65b
Diffstat (limited to 'caffe2/utils')
-rw-r--r--  caffe2/utils/conversions.h     182
-rw-r--r--  caffe2/utils/math-detail.h      36
-rw-r--r--  caffe2/utils/math.h             53
-rw-r--r--  caffe2/utils/math_cpu.cc       119
-rw-r--r--  caffe2/utils/math_gpu.cu       423
-rw-r--r--  caffe2/utils/math_gpu_test.cc   57

6 files changed, 238 insertions, 632 deletions
diff --git a/caffe2/utils/conversions.h b/caffe2/utils/conversions.h
deleted file mode 100644
index 0c6c3238f7..0000000000
--- a/caffe2/utils/conversions.h
+++ /dev/null
@@ -1,182 +0,0 @@
-#pragma once
-
-#include <caffe2/core/types.h>
-
-#ifdef __CUDA_ARCH__
-#include <cuda_fp16.h>
-#endif
-
-#ifdef __CUDA_ARCH__
-#define CONVERSIONS_DECL __host__ __device__ inline
-#else
-#define CONVERSIONS_DECL inline
-#endif
-
-namespace caffe2 {
-
-namespace convert {
-
-namespace {
-inline float16 cpu_float2half_rn(float f) {
- float16 ret;
-
- static_assert(
- sizeof(unsigned int) == sizeof(float),
- "Programming error sizeof(unsigned int) != sizeof(float)");
-
- unsigned* xp = reinterpret_cast<unsigned int*>(&f);
- unsigned x = *xp;
- unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
- unsigned sign, exponent, mantissa;
-
- // Get rid of +NaN/-NaN case first.
- if (u > 0x7f800000) {
- ret.x = 0x7fffU;
- return ret;
- }
-
- sign = ((x >> 16) & 0x8000);
-
- // Get rid of +Inf/-Inf, +0/-0.
- if (u > 0x477fefff) {
- ret.x = sign | 0x7c00U;
- return ret;
- }
- if (u < 0x33000001) {
- ret.x = (sign | 0x0000);
- return ret;
- }
-
- exponent = ((u >> 23) & 0xff);
- mantissa = (u & 0x7fffff);
-
- if (exponent > 0x70) {
- shift = 13;
- exponent -= 0x70;
- } else {
- shift = 0x7e - exponent;
- exponent = 0;
- mantissa |= 0x800000;
- }
- lsb = (1 << shift);
- lsb_s1 = (lsb >> 1);
- lsb_m1 = (lsb - 1);
-
- // Round to nearest even.
- remainder = (mantissa & lsb_m1);
- mantissa >>= shift;
- if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
- ++mantissa;
- if (!(mantissa & 0x3ff)) {
- ++exponent;
- mantissa = 0;
- }
- }
-
- ret.x = (sign | (exponent << 10) | mantissa);
-
- return ret;
-}
-
-inline float cpu_half2float(float16 h) {
- unsigned sign = ((h.x >> 15) & 1);
- unsigned exponent = ((h.x >> 10) & 0x1f);
- unsigned mantissa = ((h.x & 0x3ff) << 13);
-
- if (exponent == 0x1f) { /* NaN or Inf */
- mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
- exponent = 0xff;
- } else if (!exponent) { /* Denorm or Zero */
- if (mantissa) {
- unsigned int msb;
- exponent = 0x71;
- do {
- msb = (mantissa & 0x400000);
- mantissa <<= 1; /* normalize */
- --exponent;
- } while (!msb);
- mantissa &= 0x7fffff; /* 1.mantissa is implicit */
- }
- } else {
- exponent += 0x70;
- }
-
- int temp = ((sign << 31) | (exponent << 23) | mantissa);
-
- float* rp = reinterpret_cast<float*>(&temp);
- return *rp;
-}
-
-}; // anonymous
-// general version: defer to static_cast
-template <typename IN, typename OUT>
-CONVERSIONS_DECL OUT To(const IN in) {
- return static_cast<OUT>(in);
-}
-
-#if __CUDA_ARCH__
-__device__ __inline__ __half inf_clip(__half h) {
- int isi = __hisinf(h);
- if (isi > 0) {
- // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
- h.x = 0x7bffU;
- } else if (isi < 0) {
- // As above, negated
- h.x = 0x7bffU ^ 0x8000;
- }
- return h;
-}
-#endif
-
-// explicit for fp16
-template <>
-CONVERSIONS_DECL float16 To(const float in) {
-#if __CUDA_ARCH__
- // hacky interface between C2 fp16 and CUDA
- float16 ret;
- __half r;
- // r.x = __float2half_rn(in);
- // ret.x = inf_clip(r).x;
- ret.x = __float2half(in).x;
- return ret;
-#else
- return cpu_float2half_rn(in);
-#endif
-}
-
-template <>
-CONVERSIONS_DECL float To(const float16 in) {
-#if __CUDA_ARCH__
- __half tmp;
- tmp.x = in.x;
- return __half2float(tmp);
-#else
- return cpu_half2float(in);
-#endif
-};
-
-template <>
-CONVERSIONS_DECL float To(const float in) {
- return in;
-}
-
-template <typename OUT, typename IN>
-CONVERSIONS_DECL OUT Get(IN x) {
- return static_cast<OUT>(x);
-}
-
-template <>
-CONVERSIONS_DECL float Get(float16 x) {
- return To<float16, float>(x);
-}
-
-template <>
-CONVERSIONS_DECL float16 Get(float x) {
- return To<float, float16>(x);
-}
-
-}; // namespace convert
-
-}; // namespace caffe2
-
-#undef CONVERSIONS_DECL
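
The deleted header above was the whole fp16/fp32 bridge: To<IN, OUT> falls through to cpu_float2half_rn / cpu_half2float on the host and to __float2half / __half2float under __CUDA_ARCH__. A minimal usage sketch, assuming the header is still on the include path; the harness is illustrative and only To/Get come from the header:

    #include <caffe2/utils/conversions.h>

    int main() {
      using namespace caffe2;
      // fp32 -> fp16 uses round-to-nearest-even; 0.1f lands on the nearest
      // representable half-precision value, ~0.0999756 (bits 0x2E66).
      float16 h = convert::To<float, float16>(0.1f);
      // fp16 -> fp32 is an exact widening conversion.
      float f = convert::To<float16, float>(h);
      // Get<OUT>(x) is the shorthand overload that deduces IN.
      float16 g = convert::Get<float16>(f);
      return (h.x == g.x) ? 0 : 1;  // round trip preserves the bit pattern
    }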
diff --git a/caffe2/utils/math-detail.h b/caffe2/utils/math-detail.h
index 07a1f997d6..35a880a6d4 100644
--- a/caffe2/utils/math-detail.h
+++ b/caffe2/utils/math-detail.h
@@ -11,12 +11,8 @@ namespace detail {
template<typename T, class Context, int FixedSize>
struct ScaleImpl {
- inline void operator()(
- const int N,
- const float alpha,
- const T* x,
- T* y,
- Context* context) {
+ inline void
+ operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
Scale(N, alpha, x, y, context);
}
};
@@ -26,7 +22,7 @@ template<typename T>
struct ScaleImpl<T, CPUContext, 1> {
inline void operator()(
const int N,
- const float alpha,
+ const T alpha,
const T* x,
T* y,
CPUContext* context) {
@@ -37,12 +33,8 @@ struct ScaleImpl<T, CPUContext, 1> {
template<typename T, class Context, int FixedSize>
struct AxpyImpl {
- inline void operator()(
- const int N,
- const float alpha,
- const T* x,
- T* y,
- Context* context) {
+ inline void
+ operator()(const int N, const T alpha, const T* x, T* y, Context* context) {
Axpy(N, alpha, x, y, context);
}
};
@@ -52,7 +44,7 @@ template<typename T>
struct AxpyImpl<T, CPUContext, 1> {
inline void operator()(
const int N,
- const float alpha,
+ const T alpha,
const T* x,
T* y,
CPUContext* context) {
@@ -65,22 +57,14 @@ struct AxpyImpl<T, CPUContext, 1> {
} // namespace detail
template <typename T, class Context, int FixedSize>
-inline void ScaleFixedSize(
- const int N,
- const float alpha,
- const T* x,
- T* y,
- Context* context) {
+inline void
+ScaleFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) {
detail::ScaleImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}
template <typename T, class Context, int FixedSize>
-inline void AxpyFixedSize(
- const int N,
- const float alpha,
- const T* x,
- T* y,
- Context* context) {
+inline void
+AxpyFixedSize(const int N, const T alpha, const T* x, T* y, Context* context) {
detail::AxpyImpl<T, Context, FixedSize>()(N, alpha, x, y, context);
}
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index 105cb19733..a2472c0d33 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -141,20 +141,10 @@ void ColwiseMax(const int N, const int D, const T* x, T* y,
// Decaf gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
-template <typename T, class Context, class Engine = DefaultEngine>
-void Gemm(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const T* A,
- const T* B,
- const float beta,
- T* C,
- Context* context,
- TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
+template <typename T, class Context, class Engine=DefaultEngine>
+void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K, const T alpha, const T* A,
+ const T* B, const T beta, T* C, Context* context);
// We also provide a gemm that has explicit lda, ldb and ldc specified.
// In most cases you probably want to use the function above, though.
@@ -179,18 +169,10 @@ void GemmEx(
// to Trans, the output is:
// CblasNoTrans: x is an N dim vector and y is an M dim vector.
// CblasTrans: x is an M dim vector and y is an N dim vector.
-template <typename T, class Context, class Engine = DefaultEngine>
-void Gemv(
- const CBLAS_TRANSPOSE TransA,
- const int M,
- const int N,
- const float alpha,
- const T* A,
- const T* x,
- const float beta,
- T* y,
- Context* context,
- TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
+template <typename T, class Context, class Engine=DefaultEngine>
+void Gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const T alpha, const T* A, const T* x, const T beta,
+ T* y, Context* context);
template <typename T, class Context>
void Set(const TIndex N, const T alpha, T* X, Context* context);
@@ -236,31 +218,28 @@ void Select(const int N, const int D, const T* x, const int* idx, T* y,
Context* context);
template <typename T, class Context>
-void Scale(const int N, const float alpha, const T* x, T* y, Context* context);
+void Scale(const int N, const T alpha, const T* x, T* y, Context* context);
// Different from the Scale function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,
// for example on GPU.
template <typename T, class Context>
-void Scale(const int N, const float* alpha, const T* x, T* y, Context* context);
+void Scale(const int N, const T* alpha, const T* x, T* y,
+ Context* context);
template <typename T, class Context>
-void Axpy(const int N, const float alpha, const T* x, T* y, Context* context);
+void Axpy(const int N, const T alpha, const T* x, T* y, Context* context);
// Different from the Axpy function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,
// for example on GPU.
template <typename T, class Context>
-void Axpy(const int N, const float* alpha, const T* x, T* y, Context* context);
+void Axpy(const int N, const T* alpha, const T* x, T* y,
+ Context* context);
template <typename T, class Context>
-void Axpby(
- const int N,
- const float alpha,
- const T* x,
- const T b,
- T* y,
- Context* context);
+void Axpby(const int N, const T alpha, const T* x, const T b, T* y,
+ Context* context);
template <typename T, class Context, int order>
void Im2colNd(
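
For orientation, the reverted Gemm contract is C = alpha*A*B + beta*C with all matrices contiguous row-major, alpha/beta typed T, and no math_type argument. A sketch under those assumptions (the numeric harness is illustrative; only the signature comes from the header above):

    caffe2::CPUContext ctx;
    // A is 2x3, B is 3x2, so C = A*B is 2x2.
    float A[6] = {1, 2, 3,
                  4, 5, 6};
    float B[6] = {1, 0,
                  0, 1,
                  1, 1};
    float C[4];
    caffe2::math::Gemm<float, caffe2::CPUContext>(
        CblasNoTrans, CblasNoTrans, /*M=*/2, /*N=*/2, /*K=*/3,
        /*alpha=*/1.0f, A, B, /*beta=*/0.0f, C, &ctx);
    // C now holds {4, 5, 10, 11}.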
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index e4340dfaf1..5cac0c8339 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -58,18 +58,9 @@ namespace math {
// CblasTrans, respectively, for each of A and B.
template <>
void Gemm<float, CPUContext>(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const float* A,
- const float* B,
- const float beta,
- float* C,
- CPUContext* context,
- TensorProto::DataType math_type) {
+ const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K, const float alpha, const float* A,
+ const float* B, const float beta, float* C, CPUContext* context) {
auto C_mat = EigenMatrixMap<float>(C, N, M);
if (beta == 0) {
C_mat.setZero();
@@ -187,8 +178,7 @@ void Gemv<float, CPUContext>(
const float* x,
const float beta,
float* y,
- CPUContext* context,
- TensorProto::DataType math_type) {
+ CPUContext* context) {
EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
if (beta == 0) {
// In Caffe2 we often do a lazy initialization, which may contain NaNs in
@@ -215,22 +205,19 @@ void Gemv<float, CPUContext>(
}
}
-#define CAFFE2_SPECIALIZED_SCALE(T) \
- template <> \
- void Scale<T, CPUContext>( \
- const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
- EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
- } \
- template <> \
- void Scale<T, CPUContext>( \
- const int n, \
- const float* alpha, \
- const T* x, \
- T* y, \
- CPUContext* context) { \
- EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha); \
+#define CAFFE2_SPECIALIZED_SCALE(T) \
+ template <> \
+ void Scale<T, CPUContext>( \
+ const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
+ EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
+ } \
+ template <> \
+ void Scale<T, CPUContext>( \
+ const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
+ EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha); \
}
CAFFE2_SPECIALIZED_SCALE(float)
+CAFFE2_SPECIALIZED_SCALE(double)
#undef CAFFE2_SPECIALIZED_SCALE
#define CAFFE2_SPECIALIZED_DOT(T) \
@@ -241,6 +228,7 @@ void Dot<T, CPUContext>( \
*y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
}
CAFFE2_SPECIALIZED_DOT(float)
+CAFFE2_SPECIALIZED_DOT(double)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T) \
@@ -255,6 +243,7 @@ CAFFE2_SPECIALIZED_DOT(float)
EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
}
CAFFE2_SPECIALIZED_AXPY(float)
+CAFFE2_SPECIALIZED_AXPY(double)
#undef CAFFE2_SPECIALIZED_AXPY
#define CAFFE2_SPECIALIZED_AXPBY(T) \
@@ -265,24 +254,16 @@ void Axpby<T, CPUContext>(const int N, const T alpha, const T* x, \
y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha; \
}
CAFFE2_SPECIALIZED_AXPBY(float)
+CAFFE2_SPECIALIZED_AXPBY(double)
#undef CAFFE2_SPECIALIZED_AXPBY
#else // CAFFE2_USE_EIGEN_FOR_BLAS
template <>
void Gemm<float, CPUContext>(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const float* A,
- const float* B,
- const float beta,
- float* C,
- CPUContext* context,
- TensorProto::DataType math_type) {
+ const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K, const float alpha, const float* A,
+ const float* B, const float beta, float* C, CPUContext* context) {
int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K;
cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
@@ -311,39 +292,29 @@ void GemmEx<float, CPUContext>(
template <>
void Gemv<float, CPUContext>(
- const CBLAS_TRANSPOSE TransA,
- const int M,
- const int N,
- const float alpha,
- const float* A,
- const float* x,
- const float beta,
- float* y,
- CPUContext* context,
- TensorProto::DataType math_type) {
+ const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha,
+ const float* A, const float* x, const float beta, float* y,
+ CPUContext* context) {
cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
}
-#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
- template <> \
- void Scale<T, CPUContext>( \
- const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
- if (y != x) \
- cblas_##prefix##copy(n, x, 1, y, 1); \
- cblas_##prefix##scal(n, static_cast<float>(alpha), y, 1); \
- } \
- template <> \
- void Scale<T, CPUContext>( \
- const int n, \
- const float* alpha, \
- const T* x, \
- T* y, \
- CPUContext* context) { \
- if (y != x) \
- cblas_##prefix##copy(n, x, 1, y, 1); \
- cblas_##prefix##scal(n, static_cast<float>(*alpha), y, 1); \
+#define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
+ template <> \
+ void Scale<T, CPUContext>( \
+ const int n, const T alpha, const T* x, T* y, CPUContext* context) { \
+ if (y != x) \
+ cblas_##prefix##copy(n, x, 1, y, 1); \
+ cblas_##prefix##scal(n, alpha, y, 1); \
+ } \
+ template <> \
+ void Scale<T, CPUContext>( \
+ const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
+ if (y != x) \
+ cblas_##prefix##copy(n, x, 1, y, 1); \
+ cblas_##prefix##scal(n, *alpha, y, 1); \
}
CAFFE2_SPECIALIZED_SCALE(float, s)
+CAFFE2_SPECIALIZED_SCALE(double, d)
#undef CAFFE2_SPECIALIZED_SCALE
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
@@ -354,6 +325,7 @@ void Dot<T, CPUContext>( \
*y = cblas_##prefix##dot(N, a, 1, b, 1); \
}
CAFFE2_SPECIALIZED_DOT(float, s)
+CAFFE2_SPECIALIZED_DOT(double, d)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
@@ -368,6 +340,7 @@ CAFFE2_SPECIALIZED_DOT(float, s)
cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
}
CAFFE2_SPECIALIZED_AXPY(float, s)
+CAFFE2_SPECIALIZED_AXPY(double, d)
#undef CAFFE2_SPECIALIZED_AXPY
// cblas_[sd]axpby is not a standard blas function, and if MKL is not present,
@@ -389,6 +362,7 @@ void Axpby<T, CPUContext>(const int N, const T alpha, const T* x, \
}
#endif // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_AXPBY(float, s)
+CAFFE2_SPECIALIZED_AXPBY(double, d)
#undef CAFFE2_SPECIALIZED_AXPBY
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
@@ -462,8 +436,11 @@ void Funcname<T, CPUContext>(const int N, const T* x, T* y, \
EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr(); \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#define DELEGATE_POWX_FUNCTION(T) \
@@ -473,6 +450,7 @@ void Powx<T, CPUContext>( \
EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b); \
}
DELEGATE_POWX_FUNCTION(float)
+DELEGATE_POWX_FUNCTION(double)
#undef DELEGATE_POWX_FUNCTION
#endif // CAFFE2_USE_MKL
@@ -498,6 +476,7 @@ EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \
+EIGEN_SIMPLE_BINARY_FUNCTION(double, Funcname, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
@@ -567,6 +546,7 @@ CAFFE2_SPECIALIZED_COLWISEMAX(float)
DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \
+ DELEGATE_BROADCAST_BINARY_FUNCTION(double, name, op)
DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
@@ -622,6 +602,7 @@ CAFFE2_SPECIALIZED_SET(uint16_t);
#define CAFFE2_DEFINE_BINARY_OP(name, op) \
CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \
+ CAFFE2_INSTANTIATE_BINARY_OP(name, op, double) \
CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \
CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t)
@@ -663,6 +644,7 @@ void Not<bool, CPUContext>(
}
CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
+CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(double);
#undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
template <>
@@ -735,6 +717,7 @@ void RandGaussian<float, CPUContext>(
}
CAFFE2_SPECIALIZED_SUM(float);
+CAFFE2_SPECIALIZED_SUM(double);
CAFFE2_SPECIALIZED_SUM(int32_t);
CAFFE2_SPECIALIZED_SUM(int64_t);
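
Spelled out, the Eigen-branch CAFFE2_SPECIALIZED_SCALE(double) added above expands to the following (modulo whitespace):

    template <>
    void Scale<double, CPUContext>(
        const int n, const double alpha, const double* x, double* y,
        CPUContext* context) {
      EigenVectorMap<double>(y, n) = ConstEigenVectorMap<double>(x, n) * alpha;
    }
    template <>
    void Scale<double, CPUContext>(
        const int n, const double* alpha, const double* x, double* y,
        CPUContext* context) {
      EigenVectorMap<double>(y, n) = ConstEigenVectorMap<double>(x, n) * (*alpha);
    }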
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 46c5bc02f1..cfbe91b5d0 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -5,9 +5,8 @@
#include <thrust/system/cuda/detail/par.h>
#include <thrust/version.h>
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math.h"
+#include "caffe2/core/context_gpu.h"
#if THRUST_VERSION >= 100800
#define THRUST_SUPPORTS_PER_THREAD
@@ -33,30 +32,33 @@ void Funcname<T, CUDAContext>( \
}
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Exp, exp);
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Log, log);
__device__ float cuda_sqrf(const float x) { return x * x; }
+__device__ double cuda_sqr(const double x) { return x * x; }
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf);
+DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sqr, cuda_sqr);
#undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION
-#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) \
- __global__ void _Kernel_##T##_##Funcname( \
- const int N, const T* a, const T* b, T* y) { \
- CUDA_1D_KERNEL_LOOP(i, N) { \
- float r = convert::To<T, float>(a[i]) expr convert::To<T, float>(b[i]); \
- y[i] = convert::To<float, T>(r); \
- } \
- } \
- template <> \
- void Funcname<T, CUDAContext>( \
- const int N, const T* a, const T* b, T* y, CUDAContext* context) { \
- _Kernel_##T##_##Funcname<<< \
- CAFFE_GET_BLOCKS(N), \
- CAFFE_CUDA_NUM_THREADS, \
- 0, \
- context->cuda_stream()>>>(N, a, b, y); \
+#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) \
+ __global__ void _Kernel_##T##_##Funcname( \
+ const int N, const T* a, const T* b, T* y) { \
+ CUDA_1D_KERNEL_LOOP(i, N) { \
+ y[i] = a[i] expr b[i]; \
+ } \
+ } \
+ template <> \
+ void Funcname<T, CUDAContext>( \
+ const int N, const T* a, const T* b, T* y, CUDAContext* context) { \
+ _Kernel_##T##_##Funcname<<< \
+ CAFFE_GET_BLOCKS(N), \
+ CAFFE_CUDA_NUM_THREADS, \
+ 0, \
+ context->cuda_stream()>>>(N, a, b, y); \
}
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, +);
@@ -64,27 +66,13 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, -);
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, *);
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, /);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Add, +);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Sub, -);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Mul, *);
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, Div, /);
-
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <>
void Gemm<float, CUDAContext>(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const float* A,
- const float* B,
- const float beta,
- float* C,
- CUDAContext* context,
- TensorProto::DataType math_type) {
+ const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K, const float alpha, const float* A,
+ const float* B, const float beta, float* C, CUDAContext* context) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (TransA == CblasNoTrans) ? K : M;
@@ -111,91 +99,11 @@ void Gemm<float, CUDAContext>(
}
template <>
-void Gemm<float16, CUDAContext>(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const float16* A,
- const float16* B,
- const float beta,
- float16* C,
- CUDAContext* context,
- TensorProto::DataType math_type) {
- // Note that cublas follows fortran order, so the order is different from
- // the cblas convention.
- int lda = (TransA == CblasNoTrans) ? K : M;
- int ldb = (TransB == CblasNoTrans) ? N : K;
- cublasOperation_t cuTransA =
- (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
- cublasOperation_t cuTransB =
- (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
- if (math_type == TensorProto_DataType_FLOAT) {
- CUBLAS_CHECK(cublasSgemmEx(
- context->cublas_handle(),
- cuTransB,
- cuTransA,
- N,
- M,
- K,
- &alpha,
- B,
- CUDA_R_16F,
- ldb,
- A,
- CUDA_R_16F,
- lda,
- &beta,
- C,
- CUDA_R_16F,
- N));
-
- } else if (math_type == TensorProto_DataType_FLOAT16) {
- // convert alpha, beta from caffe2::float16 -> __half
- __half alpha_fp16;
- alpha_fp16.x = convert::To<float, float16>(alpha).x;
- __half beta_fp16;
- beta_fp16.x = convert::To<float, float16>(beta).x;
- // call cublasHgemm
- CUBLAS_CHECK(cublasHgemm(
- context->cublas_handle(),
- cuTransB,
- cuTransA,
- N,
- M,
- K,
- &alpha_fp16,
- (const __half*)B,
- ldb,
- (const __half*)A,
- lda,
- &beta_fp16,
- (__half*)C,
- N));
- } else {
- // fail
- CAFFE_THROW("Unsupported math type");
- }
-}
-
-template <>
void GemmEx<float, CUDAContext>(
- const CBLAS_TRANSPOSE TransA,
- const CBLAS_TRANSPOSE TransB,
- const int M,
- const int N,
- const int K,
- const float alpha,
- const float* A,
- const int lda,
- const float* B,
- const int ldb,
- const float beta,
- float* C,
- const int ldc,
- CUDAContext* context) {
+ const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+ const int M, const int N, const int K, const float alpha, const float* A,
+ const int lda, const float* B, const int ldb, const float beta, float* C,
+ const int ldc, CUDAContext* context) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA =
@@ -221,19 +129,40 @@ void GemmEx<float, CUDAContext>(
template <>
void Gemv<float, CUDAContext>(
+ const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha,
+ const float* A, const float* x, const float beta, float* y,
+ CUDAContext* context) {
+ cublasOperation_t cuTransA =
+ (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
+ CUBLAS_ENFORCE(cublasSgemv(
+ context->cublas_handle(),
+ cuTransA,
+ N,
+ M,
+ &alpha,
+ A,
+ N,
+ x,
+ 1,
+ &beta,
+ y,
+ 1));
+}
+
+template <>
+void Gemv<double, CUDAContext>(
const CBLAS_TRANSPOSE TransA,
const int M,
const int N,
- const float alpha,
- const float* A,
- const float* x,
- const float beta,
- float* y,
- CUDAContext* context,
- TensorProto::DataType math_type) {
+ const double alpha,
+ const double* A,
+ const double* x,
+ const double beta,
+ double* y,
+ CUDAContext* context) {
cublasOperation_t cuTransA =
(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
- CUBLAS_ENFORCE(cublasSgemv(
+ CUBLAS_ENFORCE(cublasDgemv(
context->cublas_handle(),
cuTransA,
N,
@@ -287,73 +216,6 @@ CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float);
CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(double);
#undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH
-template <>
-void Gemv<float16, CUDAContext>(
- const CBLAS_TRANSPOSE TransA,
- const int M,
- const int N,
- const float alpha,
- const float16* A,
- const float16* x,
- const float beta,
- float16* y,
- CUDAContext* context,
- TensorProto::DataType math_type) {
- cublasOperation_t cuTransA =
- (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
-
- // sort out what we need to call cublasSgemmEx / cublasHgemm
- int m = (cuTransA == CUBLAS_OP_N) ? N : M;
- int k = (cuTransA == CUBLAS_OP_N) ? M : N;
- int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
- int LDC = m;
-
- if (math_type == TensorProto_DataType_FLOAT) {
- CUBLAS_CHECK(cublasSgemmEx(
- context->cublas_handle(),
- cuTransA,
- CUBLAS_OP_N,
- m,
- 1,
- k,
- &alpha,
- A,
- CUDA_R_16F,
- LDA,
- x,
- CUDA_R_16F,
- k,
- &beta,
- y,
- CUDA_R_16F,
- LDC));
- } else if (math_type == TensorProto_DataType_FLOAT16) {
- __half alpha_fp16;
- alpha_fp16.x = convert::To<float, float16>(alpha).x;
- __half beta_fp16;
- beta_fp16.x = convert::To<float, float16>(beta).x;
-
- CUBLAS_CHECK(cublasHgemm(
- context->cublas_handle(),
- cuTransA,
- CUBLAS_OP_N,
- m,
- 1,
- k,
- &alpha_fp16,
- (const __half*)A,
- LDA,
- (const __half*)x,
- k,
- &beta_fp16,
- (__half*)y,
- LDC));
- } else {
- // fail
- CAFFE_THROW("Unsupported math type");
- }
-}
-
namespace {
template <typename T>
__global__ void SetKernel(const int N, const T alpha, T* Y) {
@@ -376,7 +238,6 @@ CAFFE2_SPECIALIZED_CUDA_SET(double);
CAFFE2_SPECIALIZED_CUDA_SET(bool);
CAFFE2_SPECIALIZED_CUDA_SET(int8_t);
CAFFE2_SPECIALIZED_CUDA_SET(int16_t);
-CAFFE2_SPECIALIZED_CUDA_SET(float16);
CAFFE2_SPECIALIZED_CUDA_SET(int);
CAFFE2_SPECIALIZED_CUDA_SET(int64_t);
CAFFE2_SPECIALIZED_CUDA_SET(char);
@@ -386,11 +247,11 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
namespace {
template <typename T>
-__global__ void
-UniformShift(const int N, const float min, const float max, T* x) {
- float scale = max - min;
+__global__ void UniformShift(const int N, const T min, const T max,
+ T* x) {
+ T scale = max - min;
CUDA_1D_KERNEL_LOOP(i, N) {
- x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min);
+ x[i] = x[i] * scale + min;
}
}
@@ -475,6 +336,7 @@ void RandGaussian<double, CUDAContext>(
context->curand_generator(), r, even_n, mean, std));
}
+
template<>
void Dot<float, CUDAContext>(
const int n, const float* a, const float* b, float* y,
@@ -484,28 +346,13 @@ void Dot<float, CUDAContext>(
context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
}
-template <>
-void Dot<float16, CUDAContext>(
- const int n,
- const float16* a,
- const float16* b,
- float16* y,
+template<>
+void Dot<double, CUDAContext>(
+ const int n, const double* a, const double* b, double* y,
CUDAContext* context) {
- float16 result;
- // execute with 32-bit math
- CUBLAS_CHECK(cublasDotEx(
- context->cublas_handle(),
- n,
- a,
- CUDA_R_16F,
- 1,
- b,
- CUDA_R_16F,
- 1,
- &result,
- CUDA_R_16F,
- CUDA_R_32F));
- context->Copy<float16, CPUContext, CUDAContext>(1, &result, y);
+ double result;
+ CUBLAS_ENFORCE(cublasDdot(context->cublas_handle(), n, a, 1, b, 1, &result));
+ context->Copy<double, CPUContext, CUDAContext>(1, &result, y);
}
// A previous version of caffe2 used Thrust but it turns out that thrust
@@ -516,7 +363,7 @@ void Dot<float16, CUDAContext>(
template <typename T>
__global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
const int idx = threadIdx.x;
- __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS];
+ __shared__ T reduction_buffer[SUM_KERNEL_NTHREADS];
reduction_buffer[idx] = 0;
@@ -524,12 +371,11 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
// N -> 128
if (!square) {
for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
- reduction_buffer[idx] += convert::To<T, float>(X[i]);
+ reduction_buffer[idx] += X[i];
}
} else {
for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
- float Xi = convert::To<T, float>(X[i]);
- reduction_buffer[idx] += Xi * Xi;
+ reduction_buffer[idx] += X[i] * X[i];
}
}
__syncthreads();
@@ -547,7 +393,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
for (int i = 0; i < 32; ++i) {
tmp += reduction_buffer[i];
}
- *Y = convert::To<float, T>(tmp);
+ *Y = tmp;
}
}
@@ -560,7 +406,7 @@ __global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
}
CAFFE2_MATH_SUM_FUNC(float)
-CAFFE2_MATH_SUM_FUNC(float16)
+CAFFE2_MATH_SUM_FUNC(double)
#undef CAFFE2_MATH_SUM_FUNC
#define CAFFE2_MATH_SUMSQR_FUNC(T) \
@@ -592,33 +438,18 @@ void Select<float, CUDAContext>(
0, context->cuda_stream()>>>(N, D, x, idx, y);
}
-template <>
-void Select<float16, CUDAContext>(
- const int N,
- const int D,
- const float16* x,
- const int* idx,
- float16* y,
- CUDAContext* context) {
- SelectKernel<float16><<<
- CAFFE_GET_BLOCKS(N),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context->cuda_stream()>>>(N, D, x, idx, y);
-}
-
namespace {
template <typename T>
-__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) {
+__global__ void ScaleKernel(
+ const int n, const T alpha, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
- // y[i] = convert::To<float,T>(convert::To<T, float>(x[i]) * alpha);
- y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha);
+ y[i] = x[i] * alpha;
}
}
template <typename T>
-__global__ void
-ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) {
+__global__ void ScaleKernelDeviceAlpha(
+ const int n, const T* alpha, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(i, n) {
y[i] = x[i] * (*alpha);
}
@@ -630,20 +461,6 @@ __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) {
y[i] = powf(x[i], exponent);
}
}
-
-// fp16 specialization
-template <>
-__global__ void ScaleKernelDeviceAlpha(
- const int n,
- const float* alpha,
- const float16* x,
- float16* y) {
- CUDA_1D_KERNEL_LOOP(i, n) {
- y[i] = convert::To<float, float16>(
- convert::To<float16, float>(x[i]) * (*alpha));
- }
-}
-
} // namespace
template <>
@@ -672,17 +489,12 @@ void Scale<float, CUDAContext>(
}
template <>
-void Scale<float16, CUDAContext>(
- const int n,
- const float alpha,
- const float16* x,
- float16* y,
+void Scale<double, CUDAContext>(
+ const int n, const double alpha, const double *x, double* y,
CUDAContext* context) {
- ScaleKernel<float16><<<
- CAFFE_GET_BLOCKS(n),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context->cuda_stream()>>>(n, alpha, x, y);
+ ScaleKernel<double><<<
+ CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(
+ n, alpha, x, y);
}
template <>
@@ -695,17 +507,11 @@ void Scale<float, CUDAContext>(
}
template <>
-void Scale<float16, CUDAContext>(
- const int n,
- const float* alpha,
- const float16* x,
- float16* y,
+void Scale<double, CUDAContext>(
+ const int n, const double* alpha, const double *x, double* y,
CUDAContext* context) {
- ScaleKernelDeviceAlpha<float16><<<
- CAFFE_GET_BLOCKS(n),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context->cuda_stream()>>>(n, alpha, x, y);
+ ScaleKernelDeviceAlpha<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+ 0, context->cuda_stream()>>>(n, alpha, x, y);
}
template <>
@@ -721,42 +527,18 @@ void Axpy<float, CUDAContext>(
template <>
void Axpy<double, CUDAContext>(
const int N,
- const float alpha,
+ const double alpha,
const double* X,
double* Y,
CUDAContext* context) {
- double alpha_d{alpha};
- CUBLAS_ENFORCE(
- cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1));
-}
-
-template <>
-void Axpy<float16, CUDAContext>(
- const int N,
- const float alpha,
- const float16* X,
- float16* Y,
- CUDAContext* context) {
- CUBLAS_CHECK(cublasAxpyEx(
- context->cublas_handle(),
- N,
- &alpha,
- CUDA_R_16F,
- X,
- CUDA_R_16F,
- 1,
- Y,
- CUDA_R_16F,
- 1,
- CUDA_R_32F));
+ CUBLAS_ENFORCE(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
}
namespace {
template <typename T>
-__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) {
+__global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) {
CUDA_1D_KERNEL_LOOP(index, n) {
- y[index] = convert::Get<T>(
- convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index]));
+ y[index] += x[index] * (*a);
}
}
} // namespace
@@ -770,19 +552,14 @@ void Axpy<float, CUDAContext>(
}
template <>
-void Axpy<float16, CUDAContext>(
- const int n,
- const float* alpha,
- const float16* X,
- float16* Y,
- CUDAContext* context) {
- AxpyKernel<float16><<<
- CAFFE_GET_BLOCKS(n),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context->cuda_stream()>>>(n, alpha, X, Y);
+void Axpy<double, CUDAContext>(
+ const int n, const double* alpha, const double* X,
+ double* Y, CUDAContext* context) {
+ AxpyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+ 0, context->cuda_stream()>>>(n, alpha, X, Y);
}
+
namespace {
template <typename T>
__global__ void AxpbyKernel(const int n, const T a, const T* x,
@@ -801,6 +578,14 @@ void Axpby<float, CUDAContext>(
0, context->cuda_stream()>>>(n, a, x, b, y);
}
+template <>
+void Axpby<double, CUDAContext>(
+ const int n, const double a, const double* x, const double b, double* y,
+ CUDAContext* context) {
+ AxpbyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS,
+ 0, context->cuda_stream()>>>(n, a, x, b, y);
+}
+
namespace {
template <typename T>
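
All the kernels in this file iterate with CUDA_1D_KERNEL_LOOP, Caffe2's grid-stride loop macro from caffe2/core/common_gpu.h. Reproduced here as a sketch (quoted from memory, verify against the header):

    #define CUDA_1D_KERNEL_LOOP(i, n)                              \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
           i += blockDim.x * gridDim.x)

    // Any launch configuration covers all n elements; e.g. the double
    // ScaleKernel instantiation above is equivalent to:
    __global__ void ScaleDoubleSketch(const int n, const double alpha,
                                      const double* x, double* y) {
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        y[i] = x[i] * alpha;  // one grid-stride pass over the vector
      }
    }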
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index b1f930bec0..2ceeddd355 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -67,4 +67,61 @@ TEST(MathUtilGPUTest, testAddStripedBatch) {
}
}
+#define TEST_GEMV_WITH_TYPE(field_name) \
+ TEST(MathUtilGPUTest, testGemv_##field_name) { \
+ if (!HasCudaGPU()) \
+ return; \
+ Workspace ws; \
+ DeviceOption option; \
+ option.set_device_type(CUDA); \
+ CUDAContext context(option); \
+ Blob* blobx = ws.CreateBlob("X"); \
+ Blob* bloby = ws.CreateBlob("Y"); \
+ Blob* blobz = ws.CreateBlob("Z"); \
+ Blob* bloby_host = ws.CreateBlob("Y_host"); \
+ \
+ vector<int> shapex{64, 128}; \
+ vector<int> shapey{64}; \
+ vector<int> shapez{128}; \
+ \
+ auto* tensorx = blobx->GetMutable<Tensor<CUDAContext>>(); \
+ tensorx->Resize(shapex); \
+ math::Set<field_name, CUDAContext>( \
+ 64 * 128, \
+ (field_name)1.0, \
+ tensorx->mutable_data<field_name>(), \
+ &context); \
+ \
+ auto* tensory = bloby->GetMutable<Tensor<CUDAContext>>(); \
+ tensory->Resize(shapey); \
+ math::Set<field_name, CUDAContext>( \
+ 64, (field_name)1.0, tensory->mutable_data<field_name>(), &context); \
+ \
+ auto* tensorz = blobz->GetMutable<Tensor<CUDAContext>>(); \
+ tensorz->Resize(shapez); \
+ \
+ math::Gemv<field_name, CUDAContext>( \
+ CblasTrans, \
+ 64, \
+ 128, \
+ 1.0, \
+ tensorx->template data<field_name>(), \
+ tensory->mutable_data<field_name>(), \
+ 0.0, \
+ tensorz->template mutable_data<field_name>(), \
+ &context); \
+ context.FinishDeviceComputation(); \
+ \
+ auto* tensory_host = bloby_host->GetMutable<Tensor<CPUContext>>(); \
+ tensory_host->CopyFrom<CUDAContext, CUDAContext>(*tensorz, &context); \
+ context.FinishDeviceComputation(); \
+ \
+ for (int i = 0; i < 128; i++) { \
+ EXPECT_EQ(tensory_host->data<field_name>()[i], 64.0); \
+ } \
+ }
+
+TEST_GEMV_WITH_TYPE(float);
+TEST_GEMV_WITH_TYPE(double);
+
} // namespace caffe2
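
A quick sanity check of the value the new Gemv test expects: with CblasTrans, Gemv computes z = A^T * y for A in R^{64x128} and y in R^{64}, both filled with ones, alpha = 1.0 and beta = 0.0, so each z[i] is a sum of 64 ones. Host-side equivalent (illustrative only):

    for (int i = 0; i < 128; ++i) {
      double zi = 0.0;
      for (int j = 0; j < 64; ++j) {
        zi += 1.0 * 1.0;  // A[j][i] * y[j]
      }
      // zi == 64.0, matching the EXPECT_EQ in the macro above
    }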