#include #include "caffe2/core/context_gpu.h" #include "caffe2/operators/distance_op.h" #include "caffe2/utils/conversions.h" #include namespace caffe2 { namespace { template __global__ void SquaredL2DistanceKernel( const int N, const int D, const T* X, const T* Y, T* distance) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < N; i += gridDim.x) { float dist = 0.0; for (int j = threadIdx.x; j < D; j += blockDim.x) { T diff = X[i * D + j] - Y[i * D + j]; dist += diff * diff; } float total_dist = BlockReduce(temp_storage).Sum(dist); __syncthreads(); if (threadIdx.x == 0) { distance[i] = total_dist / 2.0; } } } } // namespace template <> bool SquaredL2DistanceOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); CAFFE_ENFORCE_EQ(X.dim(), Y.dim()); for (int i = 0; i < X.dim(); ++i) { CAFFE_ENFORCE_EQ( X.dim32(i), Y.dim32(i), "Mismatch in dimensions", X.sizes(), " / ", Y.sizes()); } int N = X.dim() > 0 ? X.dim32(0) : 1; int D = X.size() / N; auto* distance = Output(0, vector(size_t(1), N), at::dtype()); SquaredL2DistanceKernel<<< std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( N, D, X.data(), Y.data(), distance->template mutable_data()); return true; } namespace { template __global__ void StripedScaleKernel(const int N, const int D, const T* alpha, const T* x, T* y) { CUDA_1D_KERNEL_LOOP(i, N * D) { int k = i / D; y[i] = x[i] * alpha[k]; } } } template <> bool SquaredL2DistanceGradientOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); auto& dDistance = Input(2); int N = X.dim() > 0 ? X.dim32(0) : 1; int D = N > 0 ? X.size() / N : 0; CAFFE_ENFORCE(X.dim() == Y.dim()); for (int i = 0; i < X.dim(); ++i) { CAFFE_ENFORCE_EQ( X.dim32(i), Y.dim32(i), "Mismatch on dimensions: ", X.sizes(), " / ", Y.sizes()); } CAFFE_ENFORCE_EQ(dDistance.dim(), 1); CAFFE_ENFORCE_EQ(dDistance.dim32(0), N); auto* dX = Output(0, X.sizes(), at::dtype()); auto* dY = Output(1, Y.sizes(), at::dtype()); math::Sub( X.size(), X.data(), Y.data(), dX->template mutable_data(), &context_); StripedScaleKernel <<>>( N, D, dDistance.data(), dX->data(), dX->template mutable_data()); // The gradient of the other side is basically the negative. math::Scale( X.size(), -1, dX->data(), dY->template mutable_data(), &context_); return true; } namespace { template __global__ void L1DistanceKernel( const int N, const int D, const T* X, const T* Y, T* distance) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < N; i += gridDim.x) { float sum = 0.0f; for (int j = threadIdx.x; j < D; j += blockDim.x) { sum += fabsf( convert::To(X[i * D + j]) - convert::To(Y[i * D + j])); } float aggregate = BlockReduce(temp_storage).Sum(sum); __syncthreads(); if (threadIdx.x == 0) { distance[i] = aggregate; } } } } // namespace template <> bool L1DistanceOp::RunOnDevice() { auto& X = Input(0); auto& Y = Input(1); CAFFE_ENFORCE_EQ(X.dim(), Y.dim()); for (int i = 0; i < X.dim(); ++i) { CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i)); } const int N = X.dim() > 0 ? X.dim32(0) : 1; const int D = N > 0 ? 
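// Added note (illustrative sketch, not part of the original source): the two
// reduction kernels above share one pattern: a grid-stride loop assigns one
// thread block per row i, a thread-stride loop walks the D columns, and a
// cub::BlockReduce combines the per-thread partial sums. A plain CPU
// reference for SquaredL2Distance, for comparison, would be:
//
//   for (int i = 0; i < N; ++i) {
//     float dist = 0.0f;
//     for (int j = 0; j < D; ++j) {
//       const float diff = X[i * D + j] - Y[i * D + j];
//       dist += diff * diff;
//     }
//     distance[i] = dist / 2.0f;  // conventional factor of 1/2
//   }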
template <>
bool L1DistanceOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);

  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  const int N = X.dim() > 0 ? X.dim32(0) : 1;
  const int D = N > 0 ? X.size() / N : 0;
  auto* distance =
      Output(0, vector<int64_t>(size_t(1), N), at::dtype<float>());
  L1DistanceKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N,
      D,
      X.data<float>(),
      Y.data<float>(),
      distance->template mutable_data<float>());
  return true;
}

namespace {
template <typename T>
__global__ void L1DistanceGradientKernel(
    const int N,
    const int D,
    const T* X,
    const T* Y,
    const T* dDistance,
    T* dX,
    T* dY) {
  CUDA_1D_KERNEL_LOOP(i, N * D) {
    constexpr float kEps = 1e-12;
    int k = i / D;
    if (X[i] - Y[i] < -kEps) {
      dX[i] = -dDistance[k];
      dY[i] = dDistance[k];
    } else if (X[i] - Y[i] > kEps) {
      dX[i] = dDistance[k];
      dY[i] = -dDistance[k];
    } else {
      dX[i] = 0;
      dY[i] = 0;
    }
  }
}
} // namespace

template <>
bool L1DistanceGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dDistance = Input(2);

  int N = X.dim() > 0 ? X.dim32(0) : 1;
  int D = N > 0 ? X.size() / N : 0;
  CAFFE_ENFORCE(X.dim() == Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE_EQ(
        X.dim32(i),
        Y.dim32(i),
        "Mismatch on dimensions: ",
        X.sizes(),
        " / ",
        Y.sizes());
  }
  CAFFE_ENFORCE_EQ(dDistance.dim(), 1);
  CAFFE_ENFORCE_EQ(dDistance.dim32(0), N);
  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  auto* dY = Output(1, Y.sizes(), at::dtype<float>());

  L1DistanceGradientKernel<<<
      CAFFE_GET_BLOCKS(N * D),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N,
      D,
      X.data<float>(),
      Y.data<float>(),
      dDistance.data<float>(),
      dX->template mutable_data<float>(),
      dY->template mutable_data<float>());
  return true;
}
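// Added note (worked example, not in the original source): the gradient
// kernel above implements the subgradient of |x - y| with a dead zone of
// width kEps around zero:
//   d/dx |x - y| = sign(x - y), taken as 0 when |x - y| <= kEps.
// For instance, with dDistance[k] = 1:
//   X[i] = 2, Y[i] = 5  ->  dX[i] = -1, dY[i] = +1
//   X[i] = 5, Y[i] = 2  ->  dX[i] = +1, dY[i] = -1
//   X[i] = Y[i]         ->  dX[i] = dY[i] = 0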
namespace {
// One block per row: block-reduces the per-row dot product <X_i, Y_i>.
template <typename T>
__global__ void
DotProductKernel(const int N, const int D, const T* X, const T* Y, T* result) {
  for (int i = blockIdx.x; i < N; i += gridDim.x) {
    T partialSum = 0;
    int offset = i * D;
    for (int j = threadIdx.x; j < D; j += blockDim.x) {
      partialSum += X[offset + j] * Y[offset + j];
    }
    typedef cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    T sum = BlockReduce(temp_storage).Sum(partialSum);
    __syncthreads();
    if (threadIdx.x == 0) {
      result[i] = sum;
    }
  }
}

// X.size() = N*D, Y.size() = N
template <typename T>
__global__ void
BatchedMul(const int N, const int D, const T* X, const T* Y, T* result) {
  CUDA_1D_KERNEL_LOOP(i, N * D) {
    result[i] = X[i] * Y[i / D];
  }
}

// scale, XY, XN, result all have size N
template <typename T>
__global__ void Scale2AxpyScale(
    const int N, const T* scale, const T* XY, const T* XN, T* result) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    result[i] = -scale[i] * XY[i] / (XN[i] * XN[i]);
  }
}

// X.size() = N*D, alpha.size() = N, Y.size() = N*D
template <typename T>
__global__ void
BatchedAxpy(const int N, const int D, const T* alpha, const T* X, T* Y) {
  CUDA_1D_KERNEL_LOOP(i, N * D) {
    Y[i] += X[i] * alpha[i / D];
  }
}
} // namespace

template <>
bool CosineSimilarityOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  const int N = X.dim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  auto* result = Output(COS_OUT, {N}, at::dtype<float>());
  float* result_data = result->template mutable_data<float>();
  const float* X_data = X.data<float>();
  const float* Y_data = Y.data<float>();

  // Auxiliary arrays, one allocation of memory
  ReinitializeTensor(&aux_, {2 * N}, at::dtype<float>().device(CUDA));
  float* aux_data = aux_.mutable_data<float>();
  float* x2 = aux_data;
  float* y2 = aux_data + N;
  float* scale = x2;
  const float kEps = 1e-12f;
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, X_data, X_data, x2);
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, Y_data, Y_data, y2);
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, X_data, Y_data, result_data);
  math::Maximum<float, CUDAContext>(N, kEps, x2, x2, &context_);
  math::Maximum<float, CUDAContext>(N, kEps, y2, y2, &context_);
  math::Mul<float, CUDAContext>(N, x2, y2, scale, &context_);
  math::Rsqrt<float, CUDAContext>(N, scale, scale, &context_);
  math::Mul<float, CUDAContext>(N, result_data, scale, result_data, &context_);
  return true;
}
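// Added note (derivation sketch, not in the original source): with
//   cos_i = <x_i, y_i> / (||x_i|| * ||y_i||),
// the chain rule gives
//   d cos_i / d x_i = y_i / (||x_i|| * ||y_i||)
//                     - <x_i, y_i> * x_i / (||x_i||^3 * ||y_i||).
// The gradient operator below computes exactly these two terms per row:
// BatchedMul forms scale_i * y_i with scale_i = dCos_i / (||x_i|| * ||y_i||),
// Scale2AxpyScale forms -scale_i * <x_i, y_i> / ||x_i||^2, and BatchedAxpy
// accumulates that multiple of x_i into dX (and symmetrically for dY).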
template <>
bool CosineSimilarityGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dCos = Input(DER_COS_IN);
  const int N = X.dim() > 0 ? X.dim32(0) : 1;
  const int D = X.size_from_dim(1);
  CAFFE_ENFORCE(X.dim() == Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dCos.dim() == 1);
  CAFFE_ENFORCE(dCos.dim32(0) == N);
  auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
  auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());

  const auto* X_data = X.data<float>();
  const auto* Y_data = Y.data<float>();
  const auto* dCos_data = dCos.data<float>();
  auto* dX_data = dX->template mutable_data<float>();
  auto* dY_data = dY->template mutable_data<float>();

  // one memory allocation, a few arrays
  ReinitializeTensor(&aux_, {6 * N}, at::dtype<float>().device(CUDA));
  float* aux_data = aux_.mutable_data<float>();
  float* xn = aux_data;
  float* yn = aux_data + N;
  float* xy = aux_data + 2 * N;
  float* xyn = aux_data + 3 * N;
  float* scale = aux_data + 4 * N;
  float* axpy_scale = aux_data + 5 * N;
  float kEps = 1e-12f;

  // ||x||
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, X_data, X_data, xn);
  math::Maximum<float, CUDAContext>(N, kEps, xn, xn, &context_);
  math::Sqrt<float, CUDAContext>(N, xn, xn, &context_);
  // ||y||
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, Y_data, Y_data, yn);
  math::Maximum<float, CUDAContext>(N, kEps, yn, yn, &context_);
  math::Sqrt<float, CUDAContext>(N, yn, yn, &context_);
  // ||x|| * ||y||
  math::Mul<float, CUDAContext>(N, xn, yn, xyn, &context_);
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, X_data, Y_data, xy);
  math::Div<float, CUDAContext>(N, dCos_data, xyn, scale, &context_);
  // dX
  BatchedMul<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, Y_data, scale, dX_data);
  Scale2AxpyScale<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, scale, xy, xn, axpy_scale);
  BatchedAxpy<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, axpy_scale, X_data, dX_data);
  // dY
  BatchedMul<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, X_data, scale, dY_data);
  Scale2AxpyScale<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, scale, xy, yn, axpy_scale);
  BatchedAxpy<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(N, D, axpy_scale, Y_data, dY_data);
  return true;
}

template <>
bool DotProductOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  CAFFE_ENFORCE_EQ(X.dim(), Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE_EQ(X.dim32(i), Y.dim32(i));
  }
  int N, D;
  if (X.size() > 0) {
    N = X.dim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  auto* result = Output(DOT_OUT, {N}, at::dtype<float>());
  DotProductKernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N,
      D,
      X.data<float>(),
      Y.data<float>(),
      result->template mutable_data<float>());
  return true;
}

namespace {
template <typename T>
__global__ void DotProductGradientKernel(
    const int N,
    const int D,
    const T* X,
    const T* Y,
    const T* dDot,
    T* dX,
    T* dY) {
  CUDA_1D_KERNEL_LOOP(i, N * D) {
    T scale = dDot[i / D];
    dX[i] = Y[i] * scale;
    dY[i] = X[i] * scale;
  }
}
} // namespace

template <>
bool DotProductGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(X_IN);
  auto& Y = Input(Y_IN);
  auto& dDot = Input(DER_DOT_IN);
  int N, D;
  if (X.size() > 0) {
    N = X.dim() > 0 ? X.dim32(0) : 1;
    D = X.size() / N;
  } else {
    N = 0;
    D = 0;
  }
  CAFFE_ENFORCE(X.dim() == Y.dim());
  for (int i = 0; i < X.dim(); ++i) {
    CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i));
  }
  CAFFE_ENFORCE(dDot.dim() == 1);
  CAFFE_ENFORCE(dDot.dim32(0) == N);
  auto* dX = Output(DER_X_OUT, X.sizes(), at::dtype<float>());
  auto* dY = Output(DER_Y_OUT, Y.sizes(), at::dtype<float>());
  DotProductGradientKernel<<<
      CAFFE_GET_BLOCKS(N * D),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N,
      D,
      X.data<float>(),
      Y.data<float>(),
      dDot.data<float>(),
      dX->template mutable_data<float>(),
      dY->template mutable_data<float>());
  return true;
}

REGISTER_CUDA_OPERATOR(
    SquaredL2Distance,
    SquaredL2DistanceOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SquaredL2DistanceGradient,
    SquaredL2DistanceGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(L1Distance, L1DistanceOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    L1DistanceGradient,
    L1DistanceGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(DotProduct, DotProductOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    DotProductGradient,
    DotProductGradientOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    CosineSimilarity,
    CosineSimilarityOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    CosineSimilarityGradient,
    CosineSimilarityGradientOp<float, CUDAContext>);
} // namespace caffe2
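// Added usage sketch (hypothetical, not part of the original file): these
// operators are normally run as part of a Caffe2 net. A minimal host-side
// driver, assuming a Workspace `ws` whose blobs "X" and "Y" already hold
// CUDA tensors of equal shape, might look like:
//
//   OperatorDef def;
//   def.set_type("DotProduct");
//   def.add_input("X");
//   def.add_input("Y");
//   def.add_output("Z");
//   def.mutable_device_option()->set_device_type(PROTO_CUDA);
//   auto op = CreateOperator(def, &ws);
//   op->Run();
//
// Afterwards the blob "Z" holds one dot product per row of X and Y.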