author:    Xiaomeng Yang <yangxm@fb.com>  2019-02-07 18:19:46 -0800
committer: Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2019-02-07 18:38:26 -0800
commit:    2db847b3a7edc48652e144e7c9d7aa0bbed66aaa (patch)
tree:      3d95b2f625ca878f038092dea671a2bffed088c6 /caffe2
parent:    22477c6a7fc327cdf913751ab31b788ec710caa9 (diff)
Separate elementwise level2 math functions (#16753)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16753

Separate elementwise level2 math functions

i-am-not-moving-c2-to-c10

Reviewed By: houseroad

Differential Revision: D13954928

fbshipit-source-id: 1ca7a5d3da96e32510f502e5e4e79168854bee67
Diffstat (limited to 'caffe2')
-rw-r--r--  caffe2/operators/conv_transpose_op_mobile_impl.h  |   2
-rw-r--r--  caffe2/operators/layer_norm_op.cu                 |   2
-rw-r--r--  caffe2/operators/minmax_gradient_ops.cc           |  74
-rw-r--r--  caffe2/operators/minmax_ops.cc                    |  33
-rw-r--r--  caffe2/operators/minmax_ops.cu                    |  56
-rw-r--r--  caffe2/operators/minmax_ops.h                     | 115
-rw-r--r--  caffe2/operators/rsqrt_op.cu                      |   2
-rw-r--r--  caffe2/operators/utility_ops.cu                   | 104
-rw-r--r--  caffe2/quantization/server/im2col_dnnlowp.h       |   2
-rw-r--r--  caffe2/utils/CMakeLists.txt                       |   5
-rw-r--r--  caffe2/utils/math.h                               |  71
-rw-r--r--  caffe2/utils/math/broadcast.cc                    |  55
-rw-r--r--  caffe2/utils/math/broadcast.cu                    | 108
-rw-r--r--  caffe2/utils/math/broadcast.h                     |  24
-rw-r--r--  caffe2/utils/math/elementwise.cc                  | 273
-rw-r--r--  caffe2/utils/math/elementwise.cu                  | 486
-rw-r--r--  caffe2/utils/math/elementwise.h                   | 101
-rw-r--r--  caffe2/utils/math/half_utils.h                    |  49
-rw-r--r--  caffe2/utils/math/reduce.cc                       |   2
-rw-r--r--  caffe2/utils/math/reduce.cu                       |   2
-rw-r--r--  caffe2/utils/math/utils.cc (renamed from caffe2/utils/math_utils.cc) |   2
-rw-r--r--  caffe2/utils/math/utils.h (renamed from caffe2/utils/math_utils.h)   |   8
-rw-r--r--  caffe2/utils/math_cpu.cc                          | 111
-rw-r--r--  caffe2/utils/math_gpu.cu                          |  70
-rw-r--r--  caffe2/utils/math_gpu_test.cc                     |  18
25 files changed, 1038 insertions, 737 deletions
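The commit title refers to moving the simple (same-shape) elementwise binary math functions out of caffe2/utils/math.h into the new caffe2/utils/math/elementwise.h. As a minimal sketch of that interface, with signatures inferred from the CPU definitions in elementwise.cc further down:

    // Simple elementwise binary contract: C[i] = A[i] op B[i], no broadcasting.
    // (Signatures inferred from the elementwise.cc specializations below.)
    namespace caffe2 {
    namespace math {
    template <typename T, class Context>
    void Add(const int N, const T* A, const T* B, T* C, Context* context);
    template <typename T, class Context>
    void Max(const int N, const T* A, const T* B, T* C, Context* context);
    // Likewise Sub, Mul, Div, Min, And/Or/Xor, Bitwise{And,Or,Xor},
    // and the comparisons EQ/NE/LT/LE/GT/GE (which write bool* C).
    } // namespace math
    } // namespace caffe2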
diff --git a/caffe2/operators/conv_transpose_op_mobile_impl.h b/caffe2/operators/conv_transpose_op_mobile_impl.h
index f586453d5c..6869f5178d 100644
--- a/caffe2/operators/conv_transpose_op_mobile_impl.h
+++ b/caffe2/operators/conv_transpose_op_mobile_impl.h
@@ -18,7 +18,7 @@
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
C10_DECLARE_bool(caffe2_force_shared_col_buffer);
diff --git a/caffe2/operators/layer_norm_op.cu b/caffe2/operators/layer_norm_op.cu
index c3465c1d2d..440783c6eb 100644
--- a/caffe2/operators/layer_norm_op.cu
+++ b/caffe2/operators/layer_norm_op.cu
@@ -4,7 +4,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/math.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
diff --git a/caffe2/operators/minmax_gradient_ops.cc b/caffe2/operators/minmax_gradient_ops.cc
index 9c7df22d55..d288eb0946 100644
--- a/caffe2/operators/minmax_gradient_ops.cc
+++ b/caffe2/operators/minmax_gradient_ops.cc
@@ -1,66 +1,66 @@
#include "caffe2/operators/minmax_ops.h"
-#include "caffe2/utils/eigen_utils.h"
-namespace caffe2 {
+#include <string>
+#include <vector>
-REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(MinGradient, MinGradientOp<float, CPUContext>);
+#include "caffe2/utils/eigen_utils.h"
-OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
-OPERATOR_SCHEMA(MinGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
+namespace caffe2 {
template <typename T, class Context>
bool SelectGradientOpBase<T, Context>::RunOnDevice() {
- auto& output = Input(0);
- auto& grad_output = Input(1);
- const int kInputStartOffset = 2;
-
- const T* data = output.template data<T>();
- ConstEigenArrayMap<T> output_array(
- output.template data<T>(), 1, output.numel());
- ConstEigenArrayMap<T> grad_out_array(
- grad_output.template data<T>(), 1, grad_output.numel());
-
+ const auto& Y = Input(0);
+ const auto& dY = Input(1);
+ const int N = Y.numel();
+ ConstEigenVectorArrayMap<T> Y_arr(Y.template data<T>(), N);
+ ConstEigenVectorArrayMap<T> dY_arr(dY.template data<T>(), N);
for (int i = 0; i < OutputSize(); i++) {
- auto& input = Input(i + kInputStartOffset);
- ConstEigenArrayMap<T> input_array(
- input.template data<T>(), 1, input.numel());
-
- auto* grad_input = Output(i, input.sizes(), at::dtype<T>());
- EigenArrayMap<T> grad_in_array(
- grad_input->template mutable_data<T>(), 1, grad_input->numel());
- grad_in_array = grad_out_array *
- input_array.cwiseEqual(output_array).template cast<T>();
+ const auto& Xi = Input(i + 2);
+ auto* dXi = Output(i, Xi.sizes(), at::dtype<T>());
+ ConstEigenVectorArrayMap<T> Xi_arr(Xi.template data<T>(), N);
+ EigenVectorArrayMap<T> dXi_arr(dXi->template mutable_data<T>(), N);
+ dXi_arr = (Xi_arr == Y_arr).template cast<T>() * dY_arr;
}
return true;
}
+REGISTER_CPU_OPERATOR(MaxGradient, MaxGradientOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(MinGradient, MinGradientOp<float, CPUContext>);
+
+OPERATOR_SCHEMA(MaxGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
+OPERATOR_SCHEMA(MinGradient).NumInputs(3, INT_MAX).NumOutputs(1, INT_MAX);
+
+namespace {
+
class GetMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
- vector<OperatorDef> GetGradientDefs() override {
- auto gradInputs = vector<string>();
- auto inputs = vector<string>{O(0), GO(0)};
- for (int i = 0; i < def_.input_size(); i++) {
- gradInputs.push_back(GI(i));
+ std::vector<OperatorDef> GetGradientDefs() override {
+ std::vector<std::string> inputs = {O(0), GO(0)};
+ std::vector<std::string> grad_inputs;
+ for (int i = 0; i < def_.input_size(); ++i) {
inputs.push_back(I(i));
+ grad_inputs.push_back(GI(i));
}
- return SingleGradientDef("MaxGradient", "", inputs, gradInputs);
+ return SingleGradientDef("MaxGradient", "", inputs, grad_inputs);
}
};
-REGISTER_GRADIENT(Max, GetMaxGradient);
class GetMinGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
- auto gradInputs = vector<string>();
- auto inputs = vector<string>{O(0), GO(0)};
- for (int i = 0; i < def_.input_size(); i++) {
- gradInputs.push_back(GI(i));
+ std::vector<std::string> inputs = {O(0), GO(0)};
+ std::vector<std::string> grad_inputs;
+ for (int i = 0; i < def_.input_size(); ++i) {
inputs.push_back(I(i));
+ grad_inputs.push_back(GI(i));
}
- return SingleGradientDef("MinGradient", "", inputs, gradInputs);
+ return SingleGradientDef("MinGradient", "", inputs, grad_inputs);
}
};
+
+} // namespace
+
+REGISTER_GRADIENT(Max, GetMaxGradient);
REGISTER_GRADIENT(Min, GetMinGradient);
} // namespace caffe2
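For reference, the select-gradient rule implemented above routes the upstream gradient to every input element that equals the elementwise min/max output (so ties receive the gradient more than once). A standalone sketch in plain C++, without caffe2 types:

    #include <cstddef>
    #include <vector>

    // dXi[j] = dY[j] if Xi[j] was selected (Xi[j] == Y[j]), else 0.
    std::vector<float> SelectGradient(const std::vector<float>& Y,
                                      const std::vector<float>& dY,
                                      const std::vector<float>& Xi) {
      std::vector<float> dXi(Y.size());
      for (std::size_t j = 0; j < Y.size(); ++j) {
        dXi[j] = (Xi[j] == Y[j]) ? dY[j] : 0.0f;
      }
      return dXi;
    }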
diff --git a/caffe2/operators/minmax_ops.cc b/caffe2/operators/minmax_ops.cc
index 1a105770d2..3dd2487335 100644
--- a/caffe2/operators/minmax_ops.cc
+++ b/caffe2/operators/minmax_ops.cc
@@ -1,10 +1,9 @@
#include "caffe2/operators/minmax_ops.h"
-#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
-REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(Min, MinOp<float, CPUContext>);
+REGISTER_CPU_OPERATOR(Max, MaxOp<float, CPUContext>);
OPERATOR_SCHEMA(Max)
.NumInputs(1, INT_MAX)
@@ -155,34 +154,4 @@ Min:
"Contains the minimum valued element at each location.")
.InheritOnnxSchema();
-template <typename T, class Context>
-bool MaxOp<T, Context>::Compute() {
- auto& input0 = Input(0);
- const int N = input0.numel();
- T* output_data = Output(0)->template mutable_data<T>();
-
- for (int i = 1; i < InputSize(); i++) {
- auto input_data = Input(i).template data<T>();
- EigenVectorMap<T> output_vec(output_data, N);
- output_vec = output_vec.cwiseMax(ConstEigenVectorMap<T>(input_data, N));
- }
-
- return true;
-}
-
-template <typename T, class Context>
-bool MinOp<T, Context>::Compute() {
- auto& input0 = Input(0);
- const int N = input0.numel();
- T* output_data = Output(0)->template mutable_data<T>();
-
- for (int i = 1; i < InputSize(); i++) {
- auto input_data = Input(i).template data<T>();
- EigenVectorMap<T> output_vec(output_data, N);
- output_vec = output_vec.cwiseMin(ConstEigenVectorMap<T>(input_data, N));
- }
-
- return true;
-}
-
} // namespace caffe2
diff --git a/caffe2/operators/minmax_ops.cu b/caffe2/operators/minmax_ops.cu
new file mode 100644
index 0000000000..a853b700d1
--- /dev/null
+++ b/caffe2/operators/minmax_ops.cu
@@ -0,0 +1,56 @@
+#include "caffe2/operators/minmax_ops.h"
+
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+namespace {
+
+template <typename T>
+__global__ void SelectGradientCUDAKernel(
+ const int N,
+ const T* dY,
+ const T* X,
+ const T* Y,
+ T* dX) {
+ const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (i < N) {
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+ dX[i] = __ldg(X + i) == __ldg(Y + i) ? __ldg(dY + i) : T(0);
+#else
+ dX[i] = X[i] == Y[i] ? dY[i] : T(0);
+#endif
+ }
+}
+
+} // namespace
+
+template <>
+bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
+ const auto& Y = Input(0);
+ const auto& dY = Input(1);
+ const int N = Y.numel();
+ const int M = math::DivUp(N, CAFFE_CUDA_NUM_THREADS);
+ const float* dY_data = dY.data<float>();
+ const float* Y_data = Y.data<float>();
+ for (int i = 0; i < OutputSize(); i++) {
+ const auto& Xi = Input(i + 2);
+ auto* dXi = Output(i, Xi.sizes(), at::dtype<float>());
+ const float* Xi_data = Xi.data<float>();
+ float* dXi_data = dXi->mutable_data<float>();
+ if (N > 0) {
+ SelectGradientCUDAKernel<float>
+ <<<M, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
+ N, dY_data, Xi_data, Y_data, dXi_data);
+ }
+ }
+ return true;
+}
+
+REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
+REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);
+
+} // namespace caffe2
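The launch configuration above uses one thread per element: math::DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads each, with the in-kernel bounds check masking the tail. A sketch of the arithmetic (assuming DivUp is plain ceiling division; 128 is a stand-in for CAFFE_CUDA_NUM_THREADS):

    // Ceiling division: how many fixed-size blocks cover N elements.
    inline int DivUp(const int a, const int b) { return (a + b - 1) / b; }

    // e.g. N = 1000 with 128 threads/block: DivUp(1000, 128) = 8 blocks,
    // 8 * 128 = 1024 threads; the kernel's `if (i < N)` guard idles the
    // last 24 threads.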
diff --git a/caffe2/operators/minmax_ops.h b/caffe2/operators/minmax_ops.h
index db02e0dcb0..5668460682 100644
--- a/caffe2/operators/minmax_ops.h
+++ b/caffe2/operators/minmax_ops.h
@@ -1,7 +1,6 @@
#ifndef CAFFE2_OPERATORS_MINMAX_OPS_H_
#define CAFFE2_OPERATORS_MINMAX_OPS_H_
-#include "caffe2/core/common_omp.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
@@ -11,49 +10,99 @@
namespace caffe2 {
template <typename T, class Context>
-class MaxMinOpBase : public Operator<Context> {
+class MaxOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
- USE_SIMPLE_CTOR_DTOR(MaxMinOpBase)
- bool RunOnDevice() override {
- auto& input0 = Input(0);
- auto* output = Output(0);
-
- output->ResizeLike(input0);
- output->CopyFrom(input0, /* async */ true);
+ USE_SIMPLE_CTOR_DTOR(MaxOp)
+ bool RunOnDevice() override {
+ const auto& X0 = Input(0);
+ auto* Y = Output(0);
+ Y->ResizeLike(X0);
+ const T* X0_data = X0.template data<T>();
+ T* Y_data = Y->template mutable_data<T>();
+ const int N = X0.numel();
if (InputSize() == 1) {
+ if (Y != &X0) {
+ context_.template CopySameDevice<T>(N, X0_data, Y_data);
+ }
return true;
}
-
- // Dimension checking
- for (int i = 1; i < InputSize(); ++i) {
+ const auto& X1 = Input(1);
+ CAFFE_ENFORCE_EQ(
+ X0.sizes(),
+ Y->sizes(),
+ "Description: Input #1, input dimension:",
+ X1.sizes(),
+ " should match output dimension: ",
+ Y->sizes());
+ const T* X1_data = X1.template data<T>();
+ math::Max<T, Context>(N, X0_data, X1_data, Y_data, &context_);
+ for (int i = 2; i < InputSize(); ++i) {
+ const auto& Xi = Input(i);
CAFFE_ENFORCE_EQ(
- output->sizes(),
- Input(i).sizes(),
+ Xi.sizes(),
+ Y->sizes(),
"Description: Input #",
i,
", input dimension:",
Input(i).sizes(),
" should match output dimension: ",
- output->sizes());
+ Y->sizes());
+ const T* Xi_data = Xi.template data<T>();
+ math::Max<T, Context>(N, Y_data, Xi_data, Y_data, &context_);
}
-
- return this->Compute();
+ return true;
}
-
- virtual bool Compute() = 0;
};
template <typename T, class Context>
-class MaxOp : public MaxMinOpBase<T, Context> {
+class MinOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
- MaxOp(const OperatorDef& operator_def, Workspace* ws)
- : MaxMinOpBase<T, Context>(operator_def, ws) {}
- virtual ~MaxOp() noexcept {}
- bool Compute() override;
+
+ USE_SIMPLE_CTOR_DTOR(MinOp)
+
+ bool RunOnDevice() override {
+ const auto& X0 = Input(0);
+ auto* Y = Output(0);
+ Y->ResizeLike(X0);
+ const T* X0_data = X0.template data<T>();
+ T* Y_data = Y->template mutable_data<T>();
+ const int N = X0.numel();
+ if (InputSize() == 1) {
+ if (Y != &X0) {
+ context_.template CopySameDevice<T>(N, X0_data, Y_data);
+ }
+ return true;
+ }
+ const auto& X1 = Input(1);
+ CAFFE_ENFORCE_EQ(
+ X0.sizes(),
+ Y->sizes(),
+ "Description: Input #1, input dimension:",
+ X1.sizes(),
+ " should match output dimension: ",
+ Y->sizes());
+ const T* X1_data = X1.template data<T>();
+ math::Min<T, Context>(N, X0_data, X1_data, Y_data, &context_);
+ for (int i = 2; i < InputSize(); ++i) {
+ const auto& Xi = Input(i);
+ CAFFE_ENFORCE_EQ(
+ Xi.sizes(),
+ Y->sizes(),
+ "Description: Input #",
+ i,
+ ", input dimension:",
+ Input(i).sizes(),
+ " should match output dimension: ",
+ Y->sizes());
+ const T* Xi_data = Xi.template data<T>();
+ math::Min<T, Context>(N, Y_data, Xi_data, Y_data, &context_);
+ }
+ return true;
+ }
};
template <typename T, class Context>
@@ -66,29 +115,21 @@ class SelectGradientOpBase : public Operator<Context> {
};
template <typename T, class Context>
-class MaxGradientOp : public SelectGradientOpBase<T, Context> {
+class MaxGradientOp final : public SelectGradientOpBase<T, Context> {
public:
MaxGradientOp(const OperatorDef& operator_def, Workspace* ws)
: SelectGradientOpBase<T, Context>(operator_def, ws) {}
- virtual ~MaxGradientOp() noexcept {}
-};
-template <typename T, class Context>
-class MinOp : public MaxMinOpBase<T, Context> {
- public:
- USE_OPERATOR_CONTEXT_FUNCTIONS;
- MinOp(const OperatorDef& operator_def, Workspace* ws)
- : MaxMinOpBase<T, Context>(operator_def, ws) {}
- virtual ~MinOp() noexcept {}
- bool Compute() override;
+ ~MaxGradientOp() = default;
};
template <typename T, class Context>
-class MinGradientOp : public SelectGradientOpBase<T, Context> {
+class MinGradientOp final : public SelectGradientOpBase<T, Context> {
public:
MinGradientOp(const OperatorDef& operator_def, Workspace* ws)
: SelectGradientOpBase<T, Context>(operator_def, ws) {}
- virtual ~MinGradientOp() noexcept {}
+
+ ~MinGradientOp() = default;
};
} // namespace caffe2
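With more than two inputs, RunOnDevice above folds pairwise: Y = max(X0, X1), then Y = max(Y, Xi) for each remaining input (and symmetrically for MinOp). A plain C++ sketch of that fold:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Pairwise fold equivalent to MaxOp::RunOnDevice, on same-size inputs.
    std::vector<float> MaxOfInputs(const std::vector<std::vector<float>>& X) {
      std::vector<float> Y = X[0]; // single input: just a copy
      for (std::size_t i = 1; i < X.size(); ++i) {
        for (std::size_t j = 0; j < Y.size(); ++j) {
          Y[j] = std::max(Y[j], X[i][j]);
        }
      }
      return Y;
    }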
diff --git a/caffe2/operators/rsqrt_op.cu b/caffe2/operators/rsqrt_op.cu
index 378d131a6f..eae8dc0f44 100644
--- a/caffe2/operators/rsqrt_op.cu
+++ b/caffe2/operators/rsqrt_op.cu
@@ -4,7 +4,7 @@
#include <functional>
#include "caffe2/core/context_gpu.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math.h"
namespace caffe2 {
diff --git a/caffe2/operators/utility_ops.cu b/caffe2/operators/utility_ops.cu
index b7d17ba64f..f767a9ae11 100644
--- a/caffe2/operators/utility_ops.cu
+++ b/caffe2/operators/utility_ops.cu
@@ -1,8 +1,4 @@
-#include "caffe2/core/context_gpu.h"
-#include "caffe2/operators/flatten_op.h"
-#include "caffe2/operators/minmax_ops.h"
#include "caffe2/operators/utility_ops.h"
-#include "caffe2/utils/math.h"
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
@@ -10,6 +6,10 @@
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/operators/flatten_op.h"
+#include "caffe2/utils/math.h"
+
namespace caffe2 {
template <>
@@ -137,102 +137,6 @@ bool NanCheckOp<CUDAContext>::RunOnDevice() {
REGISTER_CUDA_OPERATOR(NanCheck, NanCheckOp<CUDAContext>);
-__global__ void
-ElwiseMaxKernel(const float* X, const float* Y, float* maxout, const int N) {
- CUDA_1D_KERNEL_LOOP(i, N) {
- maxout[i] = fmaxf(X[i], Y[i]);
- }
-}
-
-template <>
-bool MaxOp<float, CUDAContext>::Compute() {
- float* output_data = Output(0)->template mutable_data<float>();
- const int N = Input(0).numel();
-
- // Run pairwise-maxes
- for (int i = 1; i < InputSize(); ++i) {
- ElwiseMaxKernel<<<
- CAFFE_GET_BLOCKS(N),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context_.cuda_stream()>>>(
- (i == 0 ? Input(0).data<float>() : Output(0)->data<float>()),
- Input(i).data<float>(),
- output_data,
- N);
- }
-
- return true;
-}
-
-REGISTER_CUDA_OPERATOR(Max, MaxOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(MaxGradient, MaxGradientOp<float, CUDAContext>);
-
-__global__ void
-ElwiseMinKernel(const float* X, const float* Y, float* minout, const int N) {
- CUDA_1D_KERNEL_LOOP(i, N) {
- minout[i] = fminf(X[i], Y[i]);
- }
-}
-
-template <>
-bool MinOp<float, CUDAContext>::Compute() {
- float* output_data = Output(0)->template mutable_data<float>();
- const int N = Input(0).numel();
-
- // Run pairwise-mines
- for (int i = 1; i < InputSize(); ++i) {
- ElwiseMinKernel<<<
- CAFFE_GET_BLOCKS(N),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context_.cuda_stream()>>>(
- (i == 0 ? Input(0).data<float>() : Output(0)->data<float>()),
- Input(i).data<float>(),
- output_data,
- N);
- }
-
- return true;
-}
-
-REGISTER_CUDA_OPERATOR(Min, MinOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(MinGradient, MinGradientOp<float, CUDAContext>);
-
-template <typename T>
-__global__ void
-MaxMinGradKernel(int N, const T* mx, const T* x, const T* go, T* gi) {
- CUDA_1D_KERNEL_LOOP(i, N) {
- gi[i] = go[i] * (mx[i] == x[i]);
- }
-}
-
-template <>
-bool SelectGradientOpBase<float, CUDAContext>::RunOnDevice() {
- auto& output = Input(0);
- auto& grad_output = Input(1);
- const int kInputStartOffset = 2;
-
- const float* data = output.data<float>();
-
- for (int i = 0; i < OutputSize(); i++) {
- auto& input = Input(i + kInputStartOffset);
-
- auto* grad_input = Output(i, input.sizes(), at::dtype<float>());
- MaxMinGradKernel<<<
- CAFFE_GET_BLOCKS(input.numel()),
- CAFFE_CUDA_NUM_THREADS,
- 0,
- context_.cuda_stream()>>>(
- input.numel(),
- output.data<float>(),
- input.data<float>(),
- grad_output.data<float>(),
- grad_input->template mutable_data<float>());
- }
- return true;
-}
-
/**
* @brief Update slices of Y in-place with a batch of weighted X's.
* Y[idx] = alpha[b] * X[b][i] + Y[idx]
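The kernels deleted above sized their grids with CAFFE_GET_BLOCKS and iterated with CUDA_1D_KERNEL_LOOP, a grid-stride loop; the replacement in minmax_ops.cu launches exactly one thread per element instead. A sketch of the grid-stride pattern (not the exact caffe2 macros):

    __global__ void ElwiseMaxSketch(const float* X, const float* Y,
                                    float* Z, const int N) {
      // Each thread strides over the array by the total thread count,
      // so any grid size covers all N elements.
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
           i += blockDim.x * gridDim.x) {
        Z[i] = fmaxf(X[i], Y[i]);
      }
    }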
diff --git a/caffe2/quantization/server/im2col_dnnlowp.h b/caffe2/quantization/server/im2col_dnnlowp.h
index fdbee29556..92f7b272ac 100644
--- a/caffe2/quantization/server/im2col_dnnlowp.h
+++ b/caffe2/quantization/server/im2col_dnnlowp.h
@@ -6,7 +6,7 @@
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt
index 9208a69641..eaa10f8865 100644
--- a/caffe2/utils/CMakeLists.txt
+++ b/caffe2/utils/CMakeLists.txt
@@ -1,10 +1,11 @@
list(APPEND Caffe2_CPU_SRCS
utils/bench_utils.cc
utils/cpuid.cc
+ utils/math/broadcast.cc
utils/math/elementwise.cc
utils/math/reduce.cc
+ utils/math/utils.cc
utils/math_cpu.cc
- utils/math_utils.cc
utils/murmur_hash3.cc
utils/proto_convert.cc
utils/proto_utils.cc
@@ -26,12 +27,14 @@ if (NOT MSVC)
endif()
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
+ utils/math/broadcast.cu
utils/math/elementwise.cu
utils/math/reduce.cu
utils/math_gpu.cu
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
+ utils/math/hip/broadcast.hip
utils/math/hip/elementwise.hip
utils/math/hip/reduce.hip
utils/hip/math_gpu.hip
diff --git a/caffe2/utils/math.h b/caffe2/utils/math.h
index f870c3d0f8..2ea960176a 100644
--- a/caffe2/utils/math.h
+++ b/caffe2/utils/math.h
@@ -15,9 +15,10 @@ extern "C" {
#include "caffe2/core/common.h"
#include "caffe2/core/types.h"
+#include "caffe2/utils/math/broadcast.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/reduce.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
@@ -31,9 +32,6 @@ class CAFFE2_API DefaultEngine {};
namespace math {
#define C10_DECLARE_COMPARE_OP(Comp) \
- template <typename T, class Context> \
- void Comp(const int N, const T* A, const T* B, bool* C, Context* context); \
- \
template <typename T, class Context, bool kBroadcast1st = false> \
void Rowwise##Comp( \
const int rows, \
@@ -72,37 +70,34 @@ C10_DECLARE_COMPARE_OP(GE)
#undef C10_DECLARE_COMPARE_OP
-#define C10_DECLARE_BINARY_OP(Func) \
- template <typename T, class Context> \
- void Func(const int N, const T* A, const T* B, T* C, Context* context); \
- \
- template <typename T, class Context, bool kBroadcast1st = false> \
- void Rowwise##Func( \
- const int rows, \
- const int cols, \
- const T* A, \
- const T* B, \
- T* C, \
- Context* context); \
- \
- template <typename T, class Context, bool kBroadcast1st = false> \
- void Colwise##Func( \
- const int rows, \
- const int cols, \
- const T* A, \
- const T* B, \
- T* C, \
- Context* context); \
- \
- template <typename T, class Context> \
- void Func( \
- const int A_ndim, \
- const int* A_dims, \
- const int B_ndim, \
- const int* B_dims, \
- const T* A, \
- const T* B, \
- T* C, \
+#define C10_DECLARE_BINARY_OP(Func) \
+ template <typename T, class Context, bool kBroadcast1st = false> \
+ void Rowwise##Func( \
+ const int rows, \
+ const int cols, \
+ const T* A, \
+ const T* B, \
+ T* C, \
+ Context* context); \
+ \
+ template <typename T, class Context, bool kBroadcast1st = false> \
+ void Colwise##Func( \
+ const int rows, \
+ const int cols, \
+ const T* A, \
+ const T* B, \
+ T* C, \
+ Context* context); \
+ \
+ template <typename T, class Context> \
+ void Func( \
+ const int A_ndim, \
+ const int* A_dims, \
+ const int B_ndim, \
+ const int* B_dims, \
+ const T* A, \
+ const T* B, \
+ T* C, \
Context* context);
C10_DECLARE_BINARY_OP(Add)
@@ -238,11 +233,6 @@ template <typename T, class Context>
CAFFE2_API void
ColwiseMax(const int N, const int D, const T* x, T* y, Context* context);
-// Elemwise maximum of vector x and vector y. z[i] = max(x[i], y[i])
-template <typename T, class Context>
-CAFFE2_API void
-ElemwiseMax(const int N, const T* x, const T* y, T* z, Context* context);
-
// Elemwise maximum of vector x and scalar alpha. y[i] = max(x[i], alpha)
template <typename T, class Context>
CAFFE2_API void
@@ -621,7 +611,6 @@ CAFFE2_API void NHWC2NCHW(
T* Y,
Context* context);
-
} // namespace math
} // namespace caffe2
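After this change, C10_DECLARE_BINARY_OP only declares the broadcast variants; the same-shape form lives in elementwise.h. A hedged usage sketch of the general broadcast form declared above (the wrapper function name is illustrative):

    #include "caffe2/core/context.h"
    #include "caffe2/utils/math.h"

    void BroadcastAddExample(caffe2::CPUContext* context) {
      const int A_dims[2] = {2, 3};
      const int B_dims[2] = {1, 3}; // B's single row broadcasts over A's rows
      const float A[6] = {1, 2, 3, 4, 5, 6};
      const float B[3] = {10, 20, 30};
      float C[6]; // expected: {11, 22, 33, 14, 25, 36}
      caffe2::math::Add<float, caffe2::CPUContext>(
          2, A_dims, 2, B_dims, A, B, C, context);
    }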
diff --git a/caffe2/utils/math/broadcast.cc b/caffe2/utils/math/broadcast.cc
new file mode 100644
index 0000000000..f880bb7eeb
--- /dev/null
+++ b/caffe2/utils/math/broadcast.cc
@@ -0,0 +1,55 @@
+#include "caffe2/utils/math/broadcast.h"
+
+#include "caffe2/core/context.h"
+#include "caffe2/utils/eigen_utils.h"
+
+namespace caffe2 {
+namespace math {
+
+#define CAFFE2_SPECIALIZED_AFFINE_CHANNEL(T) \
+ template <> \
+ C10_EXPORT void AffineChannel<T, CPUContext, StorageOrder::NCHW>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ const T* scale, \
+ const T* bias, \
+ T* Y, \
+ CPUContext* /* context */) { \
+ ConstEigenVectorArrayMap<T> scale_arr(scale, C); \
+ ConstEigenVectorArrayMap<T> bias_arr(bias, C); \
+ const int stride = C * HxW; \
+ const T* X_ptr = X; \
+ T* Y_ptr = Y; \
+ for (int i = 0; i < N; ++i) { \
+ EigenArrayMap<T>(Y_ptr, HxW, C) = \
+ (ConstEigenArrayMap<T>(X_ptr, HxW, C).rowwise() * \
+ scale_arr.transpose()) \
+ .rowwise() + \
+ bias_arr.transpose(); \
+ X_ptr += stride; \
+ Y_ptr += stride; \
+ } \
+ } \
+ template <> \
+ C10_EXPORT void AffineChannel<T, CPUContext, StorageOrder::NHWC>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ const T* scale, \
+ const T* bias, \
+ T* Y, \
+ CPUContext* /* context */) { \
+ EigenArrayMap<T>(Y, C, N * HxW) = \
+ (ConstEigenArrayMap<T>(X, C, N * HxW).colwise() * \
+ ConstEigenVectorArrayMap<T>(scale, C)) \
+ .colwise() + \
+ ConstEigenVectorArrayMap<T>(bias, C); \
+ }
+CAFFE2_SPECIALIZED_AFFINE_CHANNEL(float)
+#undef CAFFE2_SPECIALIZED_AFFINE_CHANNEL
+
+} // namespace math
+} // namespace caffe2
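The Eigen expressions above implement a per-channel scale and shift. Their reference semantics for NCHW, written as plain loops:

    // Y[n][c][i] = X[n][c][i] * scale[c] + bias[c], for i in [0, HxW).
    void AffineChannelNCHWReference(const int N, const int C, const int HxW,
                                    const float* X, const float* scale,
                                    const float* bias, float* Y) {
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          const int base = (n * C + c) * HxW;
          for (int i = 0; i < HxW; ++i) {
            Y[base + i] = X[base + i] * scale[c] + bias[c];
          }
        }
      }
    }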
diff --git a/caffe2/utils/math/broadcast.cu b/caffe2/utils/math/broadcast.cu
new file mode 100644
index 0000000000..97f7cb500f
--- /dev/null
+++ b/caffe2/utils/math/broadcast.cu
@@ -0,0 +1,108 @@
+#include "caffe2/utils/math/broadcast.h"
+
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+
+namespace {
+
+template <typename T>
+__global__ void AffineChannelNCHWCUDAKernel(
+ const int C,
+ const int M,
+ const int HxW,
+ const T* X,
+ const T* scale,
+ const T* bias,
+ T* Y);
+
+template <>
+__global__ void AffineChannelNCHWCUDAKernel<float>(
+ const int C,
+ const int M,
+ const int HxW,
+ const float* X,
+ const float* scale,
+ const float* bias,
+ float* Y) {
+ const int nc = blockIdx.x / M;
+ const int c = nc % C;
+ const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (w < HxW) {
+ const int index = nc * HxW + w;
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+ Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
+#else
+ Y[index] = fmaf(X[index], scale[c], bias[c]);
+#endif
+ }
+}
+
+template <typename T>
+__global__ void AffineChannelNHWCCUDAKernel(
+ const int C,
+ const T* X,
+ const T* scale,
+ const T* bias,
+ T* Y);
+
+template <>
+__global__ void AffineChannelNHWCCUDAKernel<float>(
+ const int C,
+ const float* X,
+ const float* scale,
+ const float* bias,
+ float* Y) {
+ const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (c < C) {
+ const int index = blockIdx.x * C + c;
+#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
+ Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
+#else
+ Y[index] = fmaf(X[index], scale[c], bias[c]);
+#endif
+ }
+}
+
+} // namespace
+
+#define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ const T* scale, \
+ const T* bias, \
+ T* Y, \
+ CUDAContext* context) { \
+ const int M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS); \
+ AffineChannelNCHWCUDAKernel<T> \
+ <<<N * C * M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
+ C, M, HxW, X, scale, bias, Y); \
+ } \
+ template <> \
+ CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
+ const int N, \
+ const int C, \
+ const int HxW, \
+ const T* X, \
+ const T* scale, \
+ const T* bias, \
+ T* Y, \
+ CUDAContext* context) { \
+ const int M = DivUp(C, CAFFE_CUDA_NUM_THREADS); \
+ AffineChannelNHWCCUDAKernel<T> \
+ <<<dim3(N* HxW, M), \
+ CAFFE_CUDA_NUM_THREADS, \
+ 0, \
+ context->cuda_stream()>>>(C, X, scale, bias, Y); \
+ }
+CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
+#undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL
+
+} // namespace math
+} // namespace caffe2
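The NCHW launch above flattens (sample, channel, tile) into a one-dimensional grid of N * C * M blocks, with M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS). A sketch of how the kernel recovers its indices from blockIdx.x (helper name is illustrative):

    // b = blockIdx.x, t = threadIdx.x, threads = CAFFE_CUDA_NUM_THREADS.
    inline void DecodeNCHWBlock(const int b, const int M, const int C,
                                const int threads, const int t,
                                int* c, int* w) {
      const int nc = b / M;        // which (n, c) image plane
      *c = nc % C;                 // channel index, selects scale/bias
      *w = (b % M) * threads + t;  // offset inside the HxW plane
    }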
diff --git a/caffe2/utils/math/broadcast.h b/caffe2/utils/math/broadcast.h
new file mode 100644
index 0000000000..67e37d1bd9
--- /dev/null
+++ b/caffe2/utils/math/broadcast.h
@@ -0,0 +1,24 @@
+#ifndef CAFFE2_UTILS_MATH_BROADCAST_H_
+#define CAFFE2_UTILS_MATH_BROADCAST_H_
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/types.h"
+
+namespace caffe2 {
+namespace math {
+
+template <typename T, class Context, StorageOrder kOrder>
+CAFFE2_API void AffineChannel(
+ const int N,
+ const int C,
+ const int HxW,
+ const T* X,
+ const T* scale,
+ const T* bias,
+ T* Y,
+ Context* context);
+
+} // namespace math
+} // namespace caffe2
+
+#endif // CAFFE2_UTILS_MATH_BROADCAST_H_
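A hedged usage sketch of this declaration (CPU, NCHW; the wrapper function is illustrative):

    #include "caffe2/core/context.h"
    #include "caffe2/utils/math/broadcast.h"

    void ScaleShiftChannels(caffe2::CPUContext* context) {
      const int N = 1, C = 2, HxW = 2;
      const float X[4] = {1, 2, 3, 4}; // two channels, two pixels each
      const float scale[2] = {10, 100};
      const float bias[2] = {0.5f, 0.25f};
      float Y[4]; // expected: {10.5, 20.5, 300.25, 400.25}
      caffe2::math::AffineChannel<float, caffe2::CPUContext,
                                  caffe2::StorageOrder::NCHW>(
          N, C, HxW, X, scale, bias, Y, context);
    }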
diff --git a/caffe2/utils/math/elementwise.cc b/caffe2/utils/math/elementwise.cc
index ba6fb96338..f392829ccc 100644
--- a/caffe2/utils/math/elementwise.cc
+++ b/caffe2/utils/math/elementwise.cc
@@ -1,5 +1,8 @@
#include "caffe2/utils/math/elementwise.h"
+#include <algorithm>
+#include <functional>
+
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
@@ -12,7 +15,7 @@ namespace math {
////////////////////////////////////////////////////////////////////////////////
// MKL VML alternatives.
-// Depending on whether we are using MKL, we will delegate the Caffe math
+// Depending on whether we are using MKL, we will delegate the Caffe2 math
// functions that are VML-related to either the VML call or the Eigen
// implementation. If you are setting the flags (such as AVX) right for your CPU
// architecture, usually Eigen will deliver a throughput as fast as the VML
@@ -90,6 +93,22 @@ DELEGATE_POWX_FUNCTION(float, vsPowx)
DELEGATE_POWX_FUNCTION(double, vdPowx)
#undef DELEGATE_POWX_FUNCTION
+#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
+ MKLFunc(N, A, B, C); \
+ }
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION
+
#else // CAFFE2_USE_MKL
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
@@ -127,68 +146,85 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, inverse)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, inverse)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
-#define DELEGATE_SINH_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_SINH(T) \
template <> \
C10_EXPORT void Sinh<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
ConstEigenVectorArrayMap<T> X_arr(X, N); \
EigenVectorArrayMap<T>(Y, N) = (X_arr.exp() - (-X_arr).exp()) / T(2); \
}
-DELEGATE_SINH_FUNCTION(float)
-DELEGATE_SINH_FUNCTION(double)
-#undef DELEGATE_SINH_FUNCTION
+CAFFE2_SPECIALIZED_SINH(float)
+CAFFE2_SPECIALIZED_SINH(double)
+#undef CAFFE2_SPECIALIZED_SINH
-#define DELEGATE_COSH_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_COSH(T) \
template <> \
C10_EXPORT void Cosh<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
ConstEigenVectorArrayMap<T> X_arr(X, N); \
EigenVectorArrayMap<T>(Y, N) = (X_arr.exp() + (-X_arr).exp()) / T(2); \
}
-DELEGATE_COSH_FUNCTION(float)
-DELEGATE_COSH_FUNCTION(double)
-#undef DELEGATE_COSH_FUNCTION
+CAFFE2_SPECIALIZED_COSH(float)
+CAFFE2_SPECIALIZED_COSH(double)
+#undef CAFFE2_SPECIALIZED_COSH
-#define DELEGATE_SINCOS_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_SINCOS(T) \
template <> \
C10_EXPORT void SinCos<T, CPUContext>( \
const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
EigenVectorArrayMap<T>(S, N) = ConstEigenVectorArrayMap<T>(X, N).sin(); \
EigenVectorArrayMap<T>(C, N) = ConstEigenVectorArrayMap<T>(X, N).cos(); \
}
-DELEGATE_SINCOS_FUNCTION(float)
-DELEGATE_SINCOS_FUNCTION(double)
-#undef DELEGATE_SINCOS_FUNCTION
+CAFFE2_SPECIALIZED_SINCOS(float)
+CAFFE2_SPECIALIZED_SINCOS(double)
+#undef CAFFE2_SPECIALIZED_SINCOS
-#define DELEGATE_POWX_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_POWX(T) \
template <> \
C10_EXPORT void Powx<T, CPUContext>( \
const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
EigenVectorArrayMap<T>(Y, N) = ConstEigenVectorArrayMap<T>(A, N).pow(b); \
}
-DELEGATE_POWX_FUNCTION(float)
-DELEGATE_POWX_FUNCTION(double)
-#undef DELEGATE_POWX_FUNCTION
+CAFFE2_SPECIALIZED_POWX(float)
+CAFFE2_SPECIALIZED_POWX(double)
+#undef CAFFE2_SPECIALIZED_POWX
-#define DELEGATE_CBRT_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_CBRT(T) \
template <> \
C10_EXPORT void Cbrt<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \
}
-DELEGATE_CBRT_FUNCTION(float)
-DELEGATE_CBRT_FUNCTION(double)
-#undef DELEGATE_CBRT_FUNCTION
+CAFFE2_SPECIALIZED_CBRT(float)
+CAFFE2_SPECIALIZED_CBRT(double)
+#undef CAFFE2_SPECIALIZED_CBRT
-#define DELEGATE_ERF_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_ERF(T) \
template <> \
C10_EXPORT void Erf<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
std::transform(X, X + N, Y, [](const T x) { return erf(x); }); \
}
-DELEGATE_ERF_FUNCTION(float)
-DELEGATE_ERF_FUNCTION(double)
-#undef DELEGATE_ERF_FUNCTION
+CAFFE2_SPECIALIZED_ERF(float)
+CAFFE2_SPECIALIZED_ERF(double)
+#undef CAFFE2_SPECIALIZED_ERF
+
+#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
+ EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \
+ EigenOp ConstEigenVectorArrayMap<T>(B, N); \
+ }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Add, +)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Add, +)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Sub, -)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Sub, -)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Mul, *)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Mul, *)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Div, /)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /)
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR
#endif // CAFFE2_USE_MKL
@@ -202,74 +238,155 @@ DELEGATE_ERF_FUNCTION(double)
// Eigen's Tanh implementation is faster than MKL, so use Eigen here.
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tanh, tanh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tanh, tanh)
-DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sign, sign)
-DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sign, sign)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Sign, sign)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Sign, sign)
+DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sign, sign)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sign, sign)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Abs, abs)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Abs, abs)
-DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cube, cube)
-DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cube, cube)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Cube, cube)
DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Cube, cube)
+DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cube, cube)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cube, cube)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
-#define DELEGATE_NEG_FUNCTION(T) \
+#define CAFFE2_SPECIALIZED_NEG(T) \
template <> \
C10_EXPORT void Neg<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
EigenVectorArrayMap<T>(Y, N) = -ConstEigenVectorArrayMap<T>(X, N); \
}
-DELEGATE_NEG_FUNCTION(float)
-DELEGATE_NEG_FUNCTION(double)
-DELEGATE_NEG_FUNCTION(std::int32_t)
-DELEGATE_NEG_FUNCTION(std::int64_t)
-#undef DELEGATE_NEG_FUNCTION
+CAFFE2_SPECIALIZED_NEG(std::int32_t)
+CAFFE2_SPECIALIZED_NEG(std::int64_t)
+CAFFE2_SPECIALIZED_NEG(float)
+CAFFE2_SPECIALIZED_NEG(double)
+#undef CAFFE2_SPECIALIZED_NEG
+
+#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
+ EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \
+ EigenOp ConstEigenVectorArrayMap<T>(B, N); \
+ }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Add, +)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Add, +)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Sub, -)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Sub, -)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Mul, *)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Mul, *)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Div, /)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /)
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR
-#define CAFFE2_SPECIALIZED_AFFINE_CHANNEL(T) \
- template <> \
- void AffineChannel<T, CPUContext, StorageOrder::NCHW>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- const T* scale, \
- const T* bias, \
- T* Y, \
- CPUContext* /* context */) { \
- ConstEigenVectorArrayMap<T> scale_arr(scale, C); \
- ConstEigenVectorArrayMap<T> bias_arr(bias, C); \
- const int stride = C * HxW; \
- const T* X_ptr = X; \
- T* Y_ptr = Y; \
- for (int i = 0; i < N; ++i) { \
- EigenArrayMap<T>(Y_ptr, HxW, C) = \
- (ConstEigenArrayMap<T>(X_ptr, HxW, C).rowwise() * \
- scale_arr.transpose()) \
- .rowwise() + \
- bias_arr.transpose(); \
- X_ptr += stride; \
- Y_ptr += stride; \
- } \
- } \
- template <> \
- void AffineChannel<T, CPUContext, StorageOrder::NHWC>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- const T* scale, \
- const T* bias, \
- T* Y, \
- CPUContext* /* context */) { \
- EigenArrayMap<T>(Y, C, N * HxW) = \
- (ConstEigenArrayMap<T>(X, C, N * HxW).colwise() * \
- ConstEigenVectorArrayMap<T>(scale, C)) \
- .colwise() + \
- ConstEigenVectorArrayMap<T>(bias, C); \
+#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(T, Func, EigenFunc) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
+ EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
+ ConstEigenVectorArrayMap<T>(B, N)); \
+ }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION
+
+#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(T, Func, StdFunc) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
+ std::transform(A, A + N, B, C, StdFunc); \
+ }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ bool,
+ And,
+ std::logical_and<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ bool,
+ Or,
+ std::logical_or<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(bool, Xor, std::bit_xor<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ bool,
+ BitwiseAnd,
+ std::bit_and<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int32_t,
+ BitwiseAnd,
+ std::bit_and<std::int32_t>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int64_t,
+ BitwiseAnd,
+ std::bit_and<std::int64_t>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ bool,
+ BitwiseOr,
+ std::bit_or<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int32_t,
+ BitwiseOr,
+ std::bit_or<std::int32_t>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int64_t,
+ BitwiseOr,
+ std::bit_or<std::int64_t>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ bool,
+ BitwiseXor,
+ std::bit_xor<bool>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int32_t,
+ BitwiseXor,
+ std::bit_xor<std::int32_t>())
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(
+ std::int64_t,
+ BitwiseXor,
+ std::bit_xor<std::int64_t>())
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION
+
+#define DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
+ template <> \
+ C10_EXPORT void Func<T, CPUContext>( \
+ const int N, \
+ const T* A, \
+ const T* B, \
+ bool* C, \
+ CPUContext* /* context */) { \
+ EigenVectorArrayMap<bool>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \
+ EigenOp ConstEigenVectorArrayMap<T>(B, N); \
}
-CAFFE2_SPECIALIZED_AFFINE_CHANNEL(float)
-#undef CAFFE2_SPECIALIZED_AFFINE_CHANNEL
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, EQ, ==)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, EQ, ==)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, EQ, ==)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, EQ, ==)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, EQ, ==)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, NE, !=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, NE, !=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, NE, !=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, NE, !=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, NE, !=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, LT, <)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, LT, <)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, LT, <)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, LT, <)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, LT, <)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, LE, <=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, LE, <=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, LE, <=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, LE, <=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, LE, <=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, GT, >)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, GT, >)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, GT, >)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, GT, >)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, GT, >)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, GE, >=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, GE, >=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, GE, >=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, GE, >=)
+DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, GE, >=)
+#undef DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR
} // namespace math
} // namespace caffe2
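A hedged usage sketch of the CPU specializations defined above, covering one binary function and one comparison (wrapper function is illustrative):

    #include "caffe2/core/context.h"
    #include "caffe2/utils/math/elementwise.h"

    void ElementwiseExamples(caffe2::CPUContext* context) {
      const int N = 3;
      const float A[3] = {1.0f, 5.0f, 2.0f};
      const float B[3] = {4.0f, 3.0f, 2.0f};
      float C[3];
      bool D[3];
      caffe2::math::Max<float, caffe2::CPUContext>(N, A, B, C, context);
      // C == {4, 5, 2}
      caffe2::math::EQ<float, caffe2::CPUContext>(N, A, B, D, context);
      // D == {false, false, true}
    }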
diff --git a/caffe2/utils/math/elementwise.cu b/caffe2/utils/math/elementwise.cu
index cc7613cda9..b7605deece 100644
--- a/caffe2/utils/math/elementwise.cu
+++ b/caffe2/utils/math/elementwise.cu
@@ -1,13 +1,77 @@
#include "caffe2/utils/math/elementwise.h"
+#include <thrust/functional.h>
+
#include "caffe2/core/context_gpu.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/conversions.h"
+#include "caffe2/utils/math/half_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace math {
namespace {
+#define DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(T, Func, DeviceFunc) \
+ __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
+ const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \
+ if (i < N) { \
+ Y[i] = DeviceFunc(X[i]); \
+ } \
+ }
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Exp, expf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Log, logf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cos, cosf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Acos, acosf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sin, sinf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Asin, asinf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tan, tanf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Atan, atanf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sinh, sinhf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cosh, coshf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tanh, tanhf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Abs, fabsf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqr, utils::Square<float>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqrt, sqrtf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Rsqrt, rsqrtf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cbrt, cbrtf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Erf, erff)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Erf, erf)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int32_t,
+ Cube,
+ utils::Cube<std::int32_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int64_t,
+ Cube,
+ utils::Cube<std::int64_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cube, utils::Cube<float>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Cube, utils::Cube<double>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(bool, Not, utils::Not<bool>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int32_t,
+ Neg,
+ utils::Negate<std::int32_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int64_t,
+ Neg,
+ utils::Negate<std::int64_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Neg, utils::Negate<float>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Neg, utils::Negate<double>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int32_t,
+ Sign,
+ utils::Sign<std::int32_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
+ std::int64_t,
+ Sign,
+ utils::Sign<std::int64_t>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sign, utils::Sign<float>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Sign, utils::Sign<double>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Inv, utils::Inv<float>)
+DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Inv, utils::Inv<double>)
+#undef DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION
+
template <typename T>
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
@@ -20,190 +84,276 @@ __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
}
}
-template <typename T>
-__global__ void AffineChannelNCHWCUDAKernel(
- const int C,
- const int M,
- const int HxW,
- const T* X,
- const T* scale,
- const T* bias,
- T* Y);
-
-template <>
-__global__ void AffineChannelNCHWCUDAKernel<float>(
- const int C,
- const int M,
- const int HxW,
- const float* X,
- const float* scale,
- const float* bias,
- float* Y) {
- const int nc = blockIdx.x / M;
- const int c = nc % C;
- const int w = blockIdx.x % M * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
- if (w < HxW) {
- const int index = nc * HxW + w;
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
- Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
-#else
- Y[index] = fmaf(X[index], scale[c], bias[c]);
-#endif
+template <typename T, class Func>
+__global__ void SimpleBinaryCUDAKernel(
+ const int N,
+ const Func func,
+ const T* A,
+ const T* B,
+ T* C) {
+ const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (i < N) {
+ C[i] = func(A[i], B[i]);
}
}
-template <typename T>
-__global__ void AffineChannelNHWCCUDAKernel(
- const int C,
- const T* X,
- const T* scale,
- const T* bias,
- T* Y);
-
-template <>
-__global__ void AffineChannelNHWCCUDAKernel<float>(
- const int C,
- const float* X,
- const float* scale,
- const float* bias,
- float* Y) {
- const int c = blockIdx.y * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
- if (c < C) {
- const int index = blockIdx.x * C + c;
-#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
- Y[index] = fmaf(__ldg(X + index), __ldg(scale + c), __ldg(bias + c));
-#else
- Y[index] = fmaf(X[index], scale[c], bias[c]);
-#endif
+template <typename T, class Comp>
+__global__ void SimpleCompareCUDAKernel(
+ const int N,
+ const Comp comp,
+ const T* A,
+ const T* B,
+ bool* C) {
+ const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
+ if (i < N) {
+ C[i] = comp(A[i], B[i]);
}
}
} // namespace
-#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, KernelFunc) \
- __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
- const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \
- if (i < N) { \
- Y[i] = KernelFunc(X[i]); \
- } \
- } \
- template <> \
- CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
- const int N, const T* X, T* Y, CUDAContext* context) { \
- if (N > 0) { \
- const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
- Func##CUDAKernel<<< \
- K, \
- CAFFE_CUDA_NUM_THREADS, \
- 0, \
- context->cuda_stream()>>>(N, X, Y); \
- } \
+#define DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
+ const int N, const T* X, T* Y, CUDAContext* context) { \
+ if (N > 0) { \
+ const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
+ Func##CUDAKernel<<< \
+ M, \
+ CAFFE_CUDA_NUM_THREADS, \
+ 0, \
+ context->cuda_stream()>>>(N, X, Y); \
+ } \
}
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh, sinhf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh, coshf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh, tanhf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, utils::Square<float>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt, rsqrtf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt, cbrtf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf, erff)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf, erf)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube, utils::Cube<float>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube, utils::Cube<double>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Cube)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Cube)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv)
+DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv)
+#undef DEFINE_SIMPLE_CUDA_UNARY_FUNCTION
+
+#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>( \
+ const int N, const T* X, T* S, T* C, CUDAContext* context) { \
+ if (N > 0) { \
+ const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
+ SinCosCUDAKernel<T> \
+ <<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
+ N, X, S, C); \
+ } \
+ }
+CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
+CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
+#undef CAFFE2_SPECIALIZED_CUDA_SINCOS
+
+#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
+ const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
+ if (N > 0) { \
+ const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
+ SimpleBinaryCUDAKernel<<< \
+ M, \
+ CAFFE_CUDA_NUM_THREADS, \
+ 0, \
+ context->cuda_stream()>>>(N, DeviceFunc, A, B, C); \
+ } \
+ }
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int32_t,
- Cube,
- utils::Cube<std::int32_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+ Add,
+ thrust::plus<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int64_t,
- Cube,
- utils::Cube<std::int64_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, utils::Not)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, utils::Negate<float>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, utils::Negate<double>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+ Add,
+ thrust::plus<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int32_t,
- Neg,
- utils::Negate<std::int32_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+ Sub,
+ thrust::minus<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int64_t,
- Neg,
- utils::Negate<std::int64_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, utils::Sign<float>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, utils::Sign<double>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+ Sub,
+ thrust::minus<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int32_t,
- Sign,
- utils::Sign<std::int32_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
+ Mul,
+ thrust::multiplies<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
std::int64_t,
- Sign,
- utils::Sign<std::int64_t>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv, utils::Inv<float>)
-DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv, utils::Inv<double>)
-#undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION
-
-#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>( \
- const int N, const T* X, T* S, T* C, CUDAContext* context) { \
- if (N > 0) { \
- const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
- SinCosCUDAKernel<<< \
- K, \
- CAFFE_CUDA_NUM_THREADS, \
- 0, \
- context->cuda_stream()>>>(N, X, S, C); \
- } \
- }
-CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
-CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
-#undef CAFFE2_SPECIALIZED_CUDA_SINCOS
+ Mul,
+ thrust::multiplies<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int32_t,
+ Div,
+ thrust::divides<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int64_t,
+ Div,
+ thrust::divides<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int32_t,
+ BitwiseAnd,
+ thrust::bit_and<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int64_t,
+ BitwiseAnd,
+ thrust::bit_and<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int32_t,
+ BitwiseOr,
+ thrust::bit_or<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int64_t,
+ BitwiseOr,
+ thrust::bit_or<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int32_t,
+ BitwiseXor,
+ thrust::bit_xor<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
+ std::int64_t,
+ BitwiseXor,
+ thrust::bit_xor<std::int64_t>())
+#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION
-#define CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(T) \
- template <> \
- CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NCHW>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- const T* scale, \
- const T* bias, \
- T* Y, \
- CUDAContext* context) { \
- const int M = DivUp(HxW, CAFFE_CUDA_NUM_THREADS); \
- AffineChannelNCHWCUDAKernel<T> \
- <<<N * C * M, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
- C, M, HxW, X, scale, bias, Y); \
- } \
- template <> \
- CAFFE2_CUDA_EXPORT void AffineChannel<T, CUDAContext, StorageOrder::NHWC>( \
- const int N, \
- const int C, \
- const int HxW, \
- const T* X, \
- const T* scale, \
- const T* bias, \
- T* Y, \
- CUDAContext* context) { \
- const int M = DivUp(C, CAFFE_CUDA_NUM_THREADS); \
- AffineChannelNHWCCUDAKernel<T> \
- <<<dim3(N* HxW, M), \
- CAFFE_CUDA_NUM_THREADS, \
- 0, \
- context->cuda_stream()>>>(C, X, scale, bias, Y); \
+#define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp) \
+ template <> \
+ CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
+ const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
+ if (N > 0) { \
+ const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
+ SimpleCompareCUDAKernel<<< \
+ M, \
+ CAFFE_CUDA_NUM_THREADS, \
+ 0, \
+ context->cuda_stream()>>>(N, DeviceComp, A, B, C); \
+ } \
}
-CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL(float)
-#undef CAFFE2_SPECIALIZED_CUDA_AFFINE_CHANNEL
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ EQ,
+ thrust::equal_to<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ EQ,
+ thrust::equal_to<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ NE,
+ thrust::not_equal_to<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ NE,
+ thrust::not_equal_to<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ double,
+ NE,
+ thrust::not_equal_to<double>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ LT,
+ thrust::less<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ LT,
+ thrust::less<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ LE,
+ thrust::less_equal<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ LE,
+ thrust::less_equal<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ GT,
+ thrust::greater<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ GT,
+ thrust::greater<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int32_t,
+ GE,
+ thrust::greater_equal<std::int32_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ std::int64_t,
+ GE,
+ thrust::greater_equal<std::int64_t>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
+DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
+ double,
+ GE,
+ thrust::greater_equal<double>())
+#undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION
} // namespace math
} // namespace caffe2
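
Editor's note: each DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION line above stamps out a full specialization that launches a single comparator kernel over N elements. The standalone CUDA sketch below reproduces that pattern outside caffe2; SimpleCompare, kThreads, and the block-count arithmetic are illustrative stand-ins for SimpleCompareCUDAKernel, CAFFE_CUDA_NUM_THREADS, and DivUp, not the actual internals.

    // Minimal sketch of the comparator-delegation pattern; compile with nvcc.
    #include <cstdio>
    #include <thrust/functional.h>

    constexpr int kThreads = 128;

    template <typename T, class Comp>
    __global__ void
    SimpleCompare(const int n, const Comp comp, const T* a, const T* b, bool* c) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        c[i] = comp(a[i], b[i]);  // e.g. thrust::less<float>() implements LT
      }
    }

    int main() {
      constexpr int n = 4;
      const float ha[n] = {1.f, 2.f, 3.f, 4.f};
      const float hb[n] = {4.f, 3.f, 2.f, 1.f};
      bool hc[n];
      float *da, *db;
      bool* dc;
      cudaMalloc(&da, n * sizeof(float));
      cudaMalloc(&db, n * sizeof(float));
      cudaMalloc(&dc, n * sizeof(bool));
      cudaMemcpy(da, ha, n * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(db, hb, n * sizeof(float), cudaMemcpyHostToDevice);
      const int blocks = (n + kThreads - 1) / kThreads;  // same role as DivUp
      SimpleCompare<<<blocks, kThreads>>>(n, thrust::less<float>(), da, db, dc);
      cudaMemcpy(hc, dc, n * sizeof(bool), cudaMemcpyDeviceToHost);
      for (int i = 0; i < n; ++i) {
        printf("%d", static_cast<int>(hc[i]));  // prints 1100
      }
      printf("\n");
      cudaFree(da);
      cudaFree(db);
      cudaFree(dc);
      return 0;
    }
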
diff --git a/caffe2/utils/math/elementwise.h b/caffe2/utils/math/elementwise.h
index 63890de498..b11d22b955 100644
--- a/caffe2/utils/math/elementwise.h
+++ b/caffe2/utils/math/elementwise.h
@@ -8,64 +8,97 @@ namespace caffe2 {
namespace math {
template <typename T, class Context>
-void Exp(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Exp(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Log(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Log(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Sin(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Sin(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Asin(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Asin(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Cos(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Cos(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Acos(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Acos(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Tan(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Tan(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Atan(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Atan(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Sinh(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Sinh(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Cosh(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Cosh(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void SinCos(const int N, const T* X, T* S, T* C, Context* context);
+CAFFE2_API void SinCos(int N, const T* X, T* S, T* C, Context* context);
template <typename T, class Context>
-void Tanh(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Tanh(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Abs(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Abs(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Sqr(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Sqr(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Sqrt(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Sqrt(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Rsqrt(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Rsqrt(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Cube(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Cube(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Cbrt(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Cbrt(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Neg(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Neg(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Sign(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Sign(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Not(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Not(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Powx(const int N, const T* A, const T b, T* Y, Context* context);
+CAFFE2_API void Powx(int N, const T* A, const T b, T* Y, Context* context);
template <typename T, class Context>
-void Inv(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Inv(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
-void Erf(const int N, const T* X, T* Y, Context* context);
+CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);

-template <typename T, class Context, StorageOrder kOrder>
-CAFFE2_API void AffineChannel(
- const int N,
- const int C,
- const int HxW,
- const T* X,
- const T* scale,
- const T* bias,
- T* Y,
- Context* context);
+template <typename T, class Context>
+CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Sub(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Mul(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Div(int N, const T* A, const T* B, T* C, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void Min(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Max(int N, const T* A, const T* B, T* C, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void And(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Or(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void Xor(int N, const T* A, const T* B, T* C, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void
+BitwiseAnd(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void
+BitwiseOr(int N, const T* A, const T* B, T* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void
+BitwiseXor(int N, const T* A, const T* B, T* C, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void EQ(int N, const T* A, const T* B, bool* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void NE(int N, const T* A, const T* B, bool* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void LT(int N, const T* A, const T* B, bool* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void LE(int N, const T* A, const T* B, bool* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void GT(int N, const T* A, const T* B, bool* C, Context* context);
+template <typename T, class Context>
+CAFFE2_API void GE(int N, const T* A, const T* B, bool* C, Context* context);
} // namespace math
} // namespace caffe2
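
Editor's note: the net effect of the hunk above is that the unary family keeps its signatures (now CAFFE2_API-annotated, with the redundant top-level const dropped from N), AffineChannel leaves this header (the commit relocates it, presumably into the new broadcast header), and the binary, min/max, logical, bitwise, and compare families gain declarations here. A call-site sketch, assuming a caffe2 build; callers keep including caffe2/utils/math.h, which is assumed to pull in the split headers:

    #include "caffe2/core/context.h"
    #include "caffe2/utils/math.h"

    void Example(caffe2::CPUContext* context) {
      const float a[3] = {1.f, 2.f, 3.f};
      const float b[3] = {3.f, 2.f, 1.f};
      float sum[3];
      bool lt[3];
      // Elementwise add: sum becomes {4, 4, 4}.
      caffe2::math::Add<float, caffe2::CPUContext>(3, a, b, sum, context);
      // Elementwise compare: lt becomes {true, false, false}.
      caffe2::math::LT<float, caffe2::CPUContext>(3, a, b, lt, context);
    }
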
diff --git a/caffe2/utils/math/half_utils.h b/caffe2/utils/math/half_utils.h
new file mode 100644
index 0000000000..ac841d165a
--- /dev/null
+++ b/caffe2/utils/math/half_utils.h
@@ -0,0 +1,49 @@
+#ifndef CAFFE2_UTILS_MATH_HALF_UTILS_H_
+#define CAFFE2_UTILS_MATH_HALF_UTILS_H_
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/types.h"
+#include "caffe2/utils/conversions.h"
+#include "caffe2/utils/math/utils.h"
+
+namespace caffe2 {
+namespace math {
+namespace utils {
+
+struct HalfAddFunctor {
+ MATH_UTILS_DECL at::Half operator()(const at::Half a, const at::Half b)
+ const {
+ return convert::To<float, at::Half>(
+ convert::To<at::Half, float>(a) + convert::To<at::Half, float>(b));
+ }
+};
+
+struct HalfSubFunctor {
+ MATH_UTILS_DECL at::Half operator()(const at::Half a, const at::Half b)
+ const {
+ return convert::To<float, at::Half>(
+ convert::To<at::Half, float>(a) - convert::To<at::Half, float>(b));
+ }
+};
+
+struct HalfMulFunctor {
+ MATH_UTILS_DECL at::Half operator()(const at::Half a, const at::Half b)
+ const {
+ return convert::To<float, at::Half>(
+ convert::To<at::Half, float>(a) * convert::To<at::Half, float>(b));
+ }
+};
+
+struct HalfDivFunctor {
+ MATH_UTILS_DECL at::Half operator()(const at::Half a, const at::Half b)
+ const {
+ return convert::To<float, at::Half>(
+ convert::To<at::Half, float>(a) / convert::To<at::Half, float>(b));
+ }
+};
+
+} // namespace utils
+} // namespace math
+} // namespace caffe2
+
+#endif // CAFFE2_UTILS_MATH_HALF_UTILS_H_
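
Editor's note: the four Half functors above share one idea: at::Half has no native host-side arithmetic, so each operand is widened to float, combined there, and narrowed back. A standalone sketch of that widen-compute-narrow pattern, with a single generic functor in place of the per-op structs; WidenedBinaryFunctor is illustrative, not part of caffe2:

    #include <functional>

    // Apply a float-domain op to a narrow type that converts to/from float.
    template <typename NarrowT, class FloatOp>
    struct WidenedBinaryFunctor {
      NarrowT operator()(const NarrowT a, const NarrowT b) const {
        return static_cast<NarrowT>(
            FloatOp()(static_cast<float>(a), static_cast<float>(b)));
      }
    };

    // Usage: WidenedBinaryFunctor<at::Half, std::plus<float>>() behaves like
    // HalfAddFunctor, since at::Half converts to and from float.
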
diff --git a/caffe2/utils/math/reduce.cc b/caffe2/utils/math/reduce.cc
index c654834eee..4bcbb1cda3 100644
--- a/caffe2/utils/math/reduce.cc
+++ b/caffe2/utils/math/reduce.cc
@@ -8,7 +8,7 @@
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace math {
diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
index 31a653930e..18291d35f9 100644
--- a/caffe2/utils/math/reduce.cu
+++ b/caffe2/utils/math/reduce.cu
@@ -11,7 +11,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math/reduce.cuh"
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace math {
diff --git a/caffe2/utils/math_utils.cc b/caffe2/utils/math/utils.cc
index f4f7da99f3..3b75cedaaa 100644
--- a/caffe2/utils/math_utils.cc
+++ b/caffe2/utils/math/utils.cc
@@ -1,4 +1,4 @@
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
#include <algorithm>
#include <functional>
diff --git a/caffe2/utils/math_utils.h b/caffe2/utils/math/utils.h
index b3fdb14884..b704adb188 100644
--- a/caffe2/utils/math_utils.h
+++ b/caffe2/utils/math/utils.h
@@ -3,9 +3,8 @@
#include "caffe2/core/common.h"
-// See Note [hip-clang differences to hcc]
-
-#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__HIP__)
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || \
+ defined(__HIP__)
#define MATH_UTILS_DECL inline __host__ __device__
#else
#define MATH_UTILS_DECL inline
@@ -16,7 +15,8 @@ namespace math {
namespace utils {
-MATH_UTILS_DECL bool Not(const bool x) {
+template <typename T>
+MATH_UTILS_DECL T Not(const T x) {
return !x;
}
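
Editor's note: templating utils::Not generalizes it beyond bool: `!x` is evaluated in bool and converted back to T, so integral element types get a 0/1 result for free. A host-only sketch of the resulting behavior:

    #include <cassert>
    #include <cstdint>

    template <typename T>
    inline T Not(const T x) {
      return !x;
    }

    int main() {
      assert(Not(true) == false);
      assert(Not(std::uint8_t(0)) == 1);  // zero maps to 1
      assert(Not(std::uint8_t(7)) == 0);  // any nonzero maps to 0
      return 0;
    }
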
diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc
index 0adb9239a3..3bbc1ac4ed 100644
--- a/caffe2/utils/math_cpu.cc
+++ b/caffe2/utils/math_cpu.cc
@@ -623,66 +623,6 @@ C10_EXPORT void GemmStridedBatched<float, CPUContext>(
}
////////////////////////////////////////////////////////////////////////////////
-// MKL VML alternatives.
-// Depending on whether we are using MKL, we will delegate the Caffe math
-// functions that are VML-related to either the VML call or the Eigen
-// implementation. If you are setting the flags (such as AVX) right for your CPU
-// architecture, usually Eigen will deliver a throughput as fast as the VML
-// functions.
-////////////////////////////////////////////////////////////////////////////////
-#ifdef CAFFE2_USE_MKL
-
-#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \
- template <> \
- C10_EXPORT void Func<T, CPUContext>( \
- const int N, const T* A, const T* B, T* C, CPUContext*) { \
- FuncImpl(N, A, B, C); \
- }
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
-#undef DELEGATE_SIMPLE_BINARY_FUNCTION
-
-#endif // CAFFE2_USE_MKL
-
-#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \
- template <> \
- C10_EXPORT void Func<T, CPUContext>( \
- const int N, const T* A, const T* B, T* C, CPUContext*) { \
- EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \
- expr ConstEigenVectorArrayMap<T>(B, N); \
- }
-
-#ifdef CAFFE2_USE_MKL
-
-#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr)
-
-#else
-
-#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(float, Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(double, Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \
- EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr)
-
-#endif
-
-DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
-DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
-DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
-DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
-
-#undef DEFINE_SIMPLE_BINARY_FUNCTION
-#undef EIGEN_SIMPLE_BINARY_FUNCTION
-
-////////////////////////////////////////////////////////////////////////////////
// Common math functions being used in Caffe that do not have a BLAS or MKL
// equivalent. For all these functions, we will simply implement them either via
// Eigen or via custom code.
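
Editor's note: the MKL/Eigen binary delegations removed here are not deleted outright; this commit presumably re-lands them in caffe2/utils/math/elementwise.cc. As a sketch of what the Eigen fallback expands to for Add on float, hand-expanded from EIGEN_SIMPLE_BINARY_FUNCTION(float, Add, +) and standalone apart from Eigen:

    #include <Eigen/Core>

    void AddFloat(const int n, const float* a, const float* b, float* c) {
      // Map raw pointers as Eigen arrays, add coefficient-wise, write back.
      Eigen::Map<Eigen::VectorXf>(c, n) =
          (Eigen::Map<const Eigen::ArrayXf>(a, n) +
           Eigen::Map<const Eigen::ArrayXf>(b, n))
              .matrix();
    }
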
@@ -1332,17 +1272,6 @@ CAFFE2_SPECIALIZED_ROWWISEMAX(float)
CAFFE2_SPECIALIZED_COLWISEMAX(float)
#undef CAFFE2_SPECIALIZED_COLWISEMAX
-#define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \
- template <> \
- C10_EXPORT void ElemwiseMax<T, CPUContext>( \
- const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \
- std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \
- return std::max(x_i, y_i); \
- }); \
- }
-CAFFE2_SPECIALIZED_ELEMWISEMAX(float)
-#undef CAFFE2_SPECIALIZED_ELEMWISEMAX
-
#define CAFFE2_SPECIALIZED_MAXIMUM(T) \
template <> \
C10_EXPORT void Maximum<T, CPUContext>( \
@@ -1609,46 +1538,6 @@ C10_EXPORT void BroadcastBinaryOpImpl(
} // namespace
-#define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \
- template <> \
- C10_EXPORT void Func<TIn, CPUContext>( \
- const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \
- std::transform(A, A + N, B, C, Op<TIn>()); \
- }
-
-#define DEFINE_1D_COMPARE_FUNCTION(Func, Op) \
- DELEGATE_1D_BINARY_FUNCTION(float, bool, Func, Op) \
- DELEGATE_1D_BINARY_FUNCTION(double, bool, Func, Op) \
- DELEGATE_1D_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
- DELEGATE_1D_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
- DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, Op)
-
-DEFINE_1D_COMPARE_FUNCTION(EQ, std::equal_to)
-DEFINE_1D_COMPARE_FUNCTION(NE, std::not_equal_to)
-DEFINE_1D_COMPARE_FUNCTION(LT, std::less)
-DEFINE_1D_COMPARE_FUNCTION(LE, std::less_equal)
-DEFINE_1D_COMPARE_FUNCTION(GT, std::greater)
-DEFINE_1D_COMPARE_FUNCTION(GE, std::greater_equal)
-
-#undef DEFINE_1D_COMPARE_FUNCTION
-
-DELEGATE_1D_BINARY_FUNCTION(bool, bool, And, std::logical_and)
-DELEGATE_1D_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
-DELEGATE_1D_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
-
-#define DEFINE_1D_BITWISE_BINARY_FUNCTION(Func, op) \
- DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, op) \
- DELEGATE_1D_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, op) \
- DELEGATE_1D_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, op)
-
-DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
-DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
-DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
-
-#undef DEFINE_1D_BITWISE_BINARY_FUNCTION
-
-#undef DELEGATE_1D_BINARY_FUNCTION
-
#define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \
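
Editor's note: the removed DELEGATE_1D_BINARY_FUNCTION block is the CPU mirror of the CUDA registrations earlier in this diff, and the same std::transform pattern presumably moves into caffe2/utils/math/elementwise.cc. Hand-expanded, one comparison instantiation looks like this sketch (minus the caffe2 template plumbing):

    #include <algorithm>
    #include <functional>

    // Equivalent of DELEGATE_1D_BINARY_FUNCTION(float, bool, EQ, std::equal_to).
    void EQFloat(const int n, const float* a, const float* b, bool* c) {
      std::transform(a, a + n, b, c, std::equal_to<float>());
    }
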
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index e93e6924a9..f8ccda7053 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -41,7 +41,7 @@ using CUBLAS_HALF_TYPE = rocblas_half;
using CUBLAS_HALF_TYPE = __half;
#endif // __HIP_PLATFORM_HCC
-#include "caffe2/utils/math_utils.h"
+#include "caffe2/utils/math/utils.h"
#if THRUST_VERSION >= 100800
#define THRUST_SUPPORTS_PER_THREAD
@@ -302,74 +302,6 @@ CAFFE2_CUDA_EXPORT void BroadcastBinaryOp(
} // namespace
-#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
- template <> \
- CAFFE2_CUDA_EXPORT void Func<TIn, CUDAContext>( \
- const int N, \
- const TIn* A, \
- const TIn* B, \
- TOut* C, \
- CUDAContext* context) { \
- SimpleBinaryOpCUDAKernel<TIn, TOut, Op<TIn>> \
- <<<CAFFE_GET_BLOCKS(N), \
- CAFFE_CUDA_NUM_THREADS, \
- 0, \
- context->cuda_stream()>>>(N, Op<TIn>(), A, B, C); \
- }
-
-#define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)
-
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less)
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
-DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)
-
-#undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION
-
-#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, at::Half, Func, Op)
-
-DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor)
-DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
-DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
-DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor)
-
-#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION
-
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)
-
-#define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
- DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
-
-DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
-DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
-DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
-
-#undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION
-
-DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
- float,
- float,
- ElemwiseMax,
- thrust::maximum);
-
-#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION
-
#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
CAFFE2_CUDA_EXPORT void Rowwise##Func<TIn, CUDAContext, true>( \
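
Editor's note: with this hunk, math_gpu.cu stops defining the float-only ElemwiseMax; the replacement is the general math::Max that elementwise.cu registers above via thrust::maximum, for both float and double. A migration sketch for former call sites, assuming a caffe2 CUDA build:

    #include "caffe2/core/context_gpu.h"
    #include "caffe2/utils/math.h"

    // Where code previously called math::ElemwiseMax<float, CUDAContext>,
    // math::Max takes the same (N, A, B, C, context) arguments.
    void RunMax(
        const int n,
        const float* a,
        const float* b,
        float* c,
        caffe2::CUDAContext* context) {
      caffe2::math::Max<float, caffe2::CUDAContext>(n, a, b, c, context);
    }
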
diff --git a/caffe2/utils/math_gpu_test.cc b/caffe2/utils/math_gpu_test.cc
index 80d7ff0291..99e9f69e33 100644
--- a/caffe2/utils/math_gpu_test.cc
+++ b/caffe2/utils/math_gpu_test.cc
@@ -204,24 +204,6 @@ TEST(MathUtilGPUTest, testReduceMax) {
[](int /*i*/) { return 17.0f; });
}
-TEST(MathUtilGPUTest, testElemwiseMax) {
- executeGpuBinaryOpTest(
- 13,
- 13,
- 13,
- [](int i) { return 2.0f - i; },
- [](int i) { return i - 6.0f; },
- [](int N0,
- int /*N1*/,
- const float* src0,
- const float* src1,
- float* dst,
- CUDAContext* context) {
- math::ElemwiseMax<float, CUDAContext>(N0, src0, src1, dst, context);
- },
- [](int i) { return std::max(2.0f - i, i - 6.0f); });
-}
-
TEST(MathUtilGPUTest, testCopyVector) {
executeGpuBinaryOpTest(
6,