#define CUB_STDERR
// NOTE(review): the original angle-bracket include targets were lost in
// extraction; CUB headers are the conventional companions of CUB_STDERR in
// caffe2 — TODO confirm against the upstream file.
#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/device/device_reduce.cuh>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pow_op.h"
#include "caffe2/utils/conversions.h"

namespace caffe2 {

// pow, log and other math functions are defined in
// CUDA math library in header file math.h
#define CUDA_POW(x, y) (pow(x, y))

// Renamed to PowOpKernel because a PowKernel already exists in
// caffe2/utils/math_gpu.cc; identical names conflict during hipification
// for the ROCm platform.
//
// Elementwise pow over n elements. The exponent comes from one of three
// places, selected per launch:
//   - b == NULL           -> scalar exponent `e` passed by value
//   - b_is_scalar != 0    -> single device-side exponent b[0]
//   - otherwise           -> per-element exponent b[i]
template <int b_is_scalar, typename T1, typename T2, typename R>
__global__ void PowOpKernel(const T1* a, const T2* b, T2 e, R* out, int n) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    out[i] = CUDA_POW(a[i], ((b == NULL) ? e : b[b_is_scalar ? 0 : i]));
  }
}

// Broadcast pow: `a` has pre*n elements, `b` has n elements and is
// broadcast along the leading (pre) dimension.
template <typename T1, typename T2, typename R>
__global__ void
PowBroadcastKernel(const T1* a, const T2* b, R* out, int pre, int n) {
  CUDA_1D_KERNEL_LOOP(i, pre * n) {
    out[i] = CUDA_POW(a[i], b[i % n]);
  }
}

// Two-sided broadcast pow: `a` has pre*n*post elements, `b` has n elements
// and is broadcast along both the leading (pre) and trailing (post)
// dimensions; (i / post) % n recovers b's index for flat position i.
template <typename T1, typename T2, typename R>
__global__ void PowBroadcast2Kernel(
    const T1* a,
    const T2* b,
    R* out,
    int pre,
    int n,
    int post) {
  CUDA_1D_KERNEL_LOOP(i, pre * n * post) {
    out[i] = CUDA_POW(a[i], b[(i / post) % n]);
  }
}

// Functor plugged into the generic PowOp<...> operator (see pow_op.h);
// each method launches the matching kernel on the context's stream and
// checks for launch-configuration errors.
struct CudaPowFunctor {
  // Elementwise / scalar-exponent case; see PowOpKernel for how
  // b, e and b_is_scalar interact.
  template <bool b_is_scalar, typename T1, typename T2, typename R>
  inline void
  Run(size_t n, const T1* a, const T2* b, T2 e, R* out, CUDAContext* context) {
    PowOpKernel<b_is_scalar, T1, T2, R>
        <<<CAFFE_GET_BLOCKS(n),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(a, b, e, out, n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }

  // Single-axis broadcast of the exponent tensor (length n) over pre rows.
  template <typename T1, typename T2, typename R>
  void RunWithBroadcast(
      const T1* a,
      const T2* b,
      R* out,
      size_t pre,
      size_t n,
      CUDAContext* context) {
    PowBroadcastKernel<T1, T2, R>
        <<<CAFFE_GET_BLOCKS(pre * n),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(a, b, out, pre, n);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }

  // Two-axis broadcast of the exponent tensor (length n) over pre and post.
  template <typename T1, typename T2, typename R>
  void RunWithBroadcast2(
      const T1* a,
      const T2* b,
      R* out,
      size_t pre,
      size_t n,
      size_t post,
      CUDAContext* context) {
    PowBroadcast2Kernel<T1, T2, R>
        <<<CAFFE_GET_BLOCKS(pre * n * post),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(a, b, out, pre, n, post);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
};

REGISTER_CUDA_OPERATOR(
    Pow,
    PowOp<
        TensorTypes<float> /*NumericTypes*/,
        CUDAContext,
        CudaPowFunctor,
        SameTypeAsInput>)

} // namespace caffe2