#include "caffe2/operators/tan_op.h"

// NOTE(review): the targets of the two standard includes were missing from the
// source; the headers below are the ones this file demonstrably needs
// (std::multiplies, std::accumulate, std::vector). <algorithm> is kept as a
// harmless superset in case it was one of the originals.
#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>

#include "caffe2/core/context_gpu.h"

namespace caffe2 {

// Returns x * x. Usable from both host and device code.
template <typename T>
inline __host__ __device__ T Square(const T& x) {
  return x * x;
}

// Element-wise backward pass of tan: dX[i] = dY[i] / cos(X[i])^2.
//
//   N  - number of elements to process.
//   dY - gradient flowing in from the output (read-only).
//   X  - forward-pass input (read-only).
//   dX - gradient written for the input.
//
// Iteration is delegated to CUDA_1D_KERNEL_LOOP (defined outside this file;
// presumably it covers every index in [0, N) for any launch geometry --
// TODO confirm).
template <typename T>
__global__ void
TanGradientCUDAKernel(const int N, const T* dY, const T* X, T* dX) {
  CUDA_1D_KERNEL_LOOP(i, N) {
#if __CUDA_ARCH__ >= 350
    // __ldg routes the loads through the read-only data cache; it is only
    // used when compiling for compute capability 3.5 or newer.
    dX[i] = __ldg(dY + i) / Square(cos(__ldg(X + i)));
#else
    dX[i] = dY[i] / Square(cos(X[i]));
#endif
  }
}

// Launches TanGradientCUDAKernel over every element described by X_dims.
//
//   X_dims  - dimensions of X; their product is the element count.
//   dY_dims - unused.
//   X, dY   - input and upstream-gradient buffers passed to the kernel.
//   dX      - output gradient buffer passed to the kernel.
//   context - supplies the CUDA stream the kernel is enqueued on.
//
// Always returns true: the launch is asynchronous and no error check follows
// it here, so a failed launch is not reported by this function.
//
// NOTE(review): the functor specialization (CUDAContext) and the vector
// element type (int, matching the `1` seed and `const int size`) were
// reconstructed; they must match the declaration in tan_op.h.
template <>
template <typename T>
bool TanGradientFunctor<CUDAContext>::Forward(
    const std::vector<int>& X_dims,
    const std::vector<int>& /* dY_dims */,
    const T* X,
    const T* dY,
    T* dX,
    CUDAContext* context) const {
  // Product of all dimensions; an empty dims vector yields 1.
  const int size = std::accumulate(
      X_dims.cbegin(), X_dims.cend(), 1, std::multiplies<int>());
  // NOTE(review): grid/block sizes reconstructed as the Caffe2 defaults that
  // pair with CUDA_1D_KERNEL_LOOP -- confirm against sibling operators.
  TanGradientCUDAKernel<T>
      <<<CAFFE_GET_BLOCKS(size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(size, dY, X, dX);
  return true;
}

// NOTE(review): TensorTypes<float> and the CUDAContext functor arguments were
// reconstructed -- confirm against the CPU registration of these operators.
REGISTER_CUDA_OPERATOR(
    Tan,
    UnaryElementwiseOp<
        TensorTypes<float>,
        CUDAContext,
        TanFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
    TanGradient,
    BinaryElementwiseOp<
        TensorTypes<float>,
        CUDAContext,
        TanGradientFunctor<CUDAContext>>);

} // namespace caffe2