| Field | Value | Date |
|---|---|---|
| author | Xiaomeng Yang <yangxm@fb.com> | 2019-04-04 11:46:37 -0700 |
| committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-04-04 11:52:06 -0700 |
| commit | b145dcca048a28b1184be0d90f18aa29d23f0953 (patch) | |
| tree | 1a735170ce5ba9373151266229f1c5ecc225c01a /caffe2 | |
| parent | 8732a1b42ea46fc251369bab4f29332d5e35959e (diff) | |
Add support for group ConvTranspose (#18794)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18794
Add support for the `group` argument in ConvTranspose: the CPU implementation (`conv_transpose_op_impl.h`, NCHW and NHWC orders) and the cuDNN implementation (`conv_transpose_op_cudnn.cc`) now split the input and output channels into `group` independent convolution groups, and `conv_transpose_test.py` gains a test exercising the new argument.
Reviewed By: houseroad
Differential Revision: D14741327
fbshipit-source-id: 5d947ca044bf8495dd7f8f56122441ebbcc6c7e4
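For reference, a minimal sketch of how the new `group` argument is passed through the Caffe2 Python frontend, modeled on the operator test added in this change (the blob names, shapes, and hyperparameters below are illustrative, not part of the change itself):

```python
import numpy as np
from caffe2.python import core, workspace

G = 2                                   # number of groups
N, M, C, size, kernel = 1, 4, 6, 8, 3   # both M and C must be divisible by G

# NCHW layout: input is (N, M, H, W), filter is (M, C // G, kH, kW), bias is (C,).
X = np.random.rand(N, M, size, size).astype(np.float32) - 0.5
w = np.random.rand(M, C // G, kernel, kernel).astype(np.float32) - 0.5
b = np.random.rand(C).astype(np.float32) - 0.5

op = core.CreateOperator(
    "ConvTranspose",
    ["X", "w", "b"],
    ["Y"],
    stride=1, kernel=kernel, pad=0, adj=0,
    group=G, order="NCHW",
)

workspace.FeedBlob("X", X)
workspace.FeedBlob("w", w)
workspace.FeedBlob("b", b)
workspace.RunOperatorOnce(op)
Y = workspace.FetchBlob("Y")            # (N, C, (size - 1) * stride + kernel, ...)
```

With `group=1` the operator behaves exactly as before; with `group=G` each block of `M / G` input channels produces its own `C / G` output channels.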
Diffstat (limited to 'caffe2')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | caffe2/operators/conv_transpose_op_cudnn.cc | 261 |
| -rw-r--r-- | caffe2/operators/conv_transpose_op_impl.h | 889 |
| -rw-r--r-- | caffe2/operators/conv_transpose_unpool_op_base.h | 38 |
| -rw-r--r-- | caffe2/python/operator_test/conv_transpose_test.py | 63 |
4 files changed, 765 insertions, 486 deletions
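The heart of the CPU-side change in `conv_transpose_op_impl.h` is that the single weight-term GEMM per image becomes one GEMM per group (a `GemmStridedBatched` over `G` batches in the NCHW path). A NumPy sketch of that equivalence, with purely illustrative shapes:

```python
import numpy as np

G, M, C, kH, kW, H, W = 2, 4, 6, 3, 3, 5, 5
kernel_dim = (C // G) * kH * kW           # per-group rows of the col buffer

X = np.random.randn(M, H * W).astype(np.float32)           # one NCHW image, spatially flattened
filt = np.random.randn(M, kernel_dim).astype(np.float32)   # filter viewed as (M, C/G * kH * kW)

# Grouped weight term: col_buffer[g] = filter[g]^T @ X[g], which is what
# GemmStridedBatched(CblasTrans, CblasNoTrans, G, kernel_dim, H*W, M/G, ...) computes.
Xg = X.reshape(G, M // G, H * W)
Wg = filt.reshape(G, M // G, kernel_dim)
col_buffer = np.matmul(Wg.transpose(0, 2, 1), Xg)           # (G, kernel_dim, H * W)

# Same result as running a plain GEMM separately for each group.
expected = np.stack([Wg[g].T @ Xg[g] for g in range(G)])
assert np.allclose(col_buffer, expected)
```

Each `(C / G * kH * kW, H * W)` slab is then scattered into the corresponding block of output channels by `Col2Im`, just as in the ungrouped path.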
diff --git a/caffe2/operators/conv_transpose_op_cudnn.cc b/caffe2/operators/conv_transpose_op_cudnn.cc index 8f8c9a2db8..459ccd7661 100644 --- a/caffe2/operators/conv_transpose_op_cudnn.cc +++ b/caffe2/operators/conv_transpose_op_cudnn.cc @@ -1,7 +1,10 @@ +#include "caffe2/operators/conv_transpose_op.h" + +#include <vector> + #include "caffe2/core/context_gpu.h" #include "caffe2/core/cudnn_wrappers.h" #include "caffe2/operators/conv_op_cache_cudnn.h" -#include "caffe2/operators/conv_transpose_op.h" #include "caffe2/operators/op_utils_cudnn.h" namespace caffe2 { @@ -49,6 +52,7 @@ class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> { CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_)); if (InputSize() == 3) { CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_)); } CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_)); CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_)); @@ -59,27 +63,59 @@ class CudnnConvTransposeOpBase : public ConvTransposeUnpoolBase<CUDAContext> { CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_)); if (InputSize() == 3) { CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_)); } CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_)); CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_)); } protected: - vector<int64_t> cudnn_input_dims_; - vector<int64_t> cudnn_filter_dims_; + void SetTensor4DDescriptorWithGroup( + const cudnnDataType_t data_type, + const int N, + const int C, + const int H, + const int W, + cudnnTensorDescriptor_t* desc) const { +#if CUDNN_VERSION_MIN(7, 0, 0) + const int CC = C; +#else + const int CC = C / group_; +#endif + switch (order_) { + case StorageOrder::NCHW: { + CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx( + *desc, data_type, N, CC, H, W, C * H * W, H * W, W, 1)); + break; + } + case StorageOrder::NHWC: { + CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx( + *desc, data_type, N, CC, H, W, H * W * C, 1, W * C, C)); + break; + } + default: { + LOG(FATAL) << "Unknown storage order: " << order_; + } + } + } + + std::vector<std::int64_t> cudnn_input_dims_; + std::vector<std::int64_t> cudnn_filter_dims_; CuDNNWrapper cudnn_wrapper_; cudnnTensorDescriptor_t bottom_desc_; cudnnFilterDescriptor_t filter_desc_; cudnnTensorDescriptor_t bias_desc_; cudnnTensorDescriptor_t top_desc_; + cudnnTensorDescriptor_t top_desc_for_bias_; cudnnConvolutionDescriptor_t conv_desc_; + const size_t cudnn_ws_nbytes_limit_; size_t cudnn_ws_nbytes_; bool exhaustive_search_; bool deterministic_; size_t cudnn_state_; - vector<int> force_algo_; // stored as FWD, dFILTER, dDATA + std::vector<int> force_algo_; // stored as FWD, dFILTER, dDATA bool enable_tensor_core_; }; @@ -141,10 +177,10 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { int C = 0; switch (order_) { case StorageOrder::NHWC: - C = filter.dim32(3); + C = filter.dim32(3) * group_; break; case StorageOrder::NCHW: - C = filter.dim32(1); + C = filter.dim32(1) * group_; break; default: LOG(FATAL) << "Unknown storage order: " << order_; @@ -162,9 +198,8 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { H_out = Y->dim32(1); W_out = Y->dim32(2); CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h()); - CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w()); - CAFFE_ENFORCE_EQ(filter.dim32(3), C); + CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_); break; case StorageOrder::NCHW: N = X.dim32(0); @@ -173,13 +208,14 @@ 
bool CudnnConvTransposeOp<T>::RunOnDevice() { W = X.dim32(3); H_out = Y->dim32(2); W_out = Y->dim32(3); - CAFFE_ENFORCE_EQ(filter.dim32(1), C); + CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_); CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w()); break; default: LOG(FATAL) << "Unknown storage order: " << order_; } + CAFFE_ENFORCE_EQ(M % group_, 0); if (InputSize() == 3) { auto& bias = Input(BIAS); @@ -188,30 +224,29 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { } // Set up the cudnn algorithms & workspace if necessary - bool input_changed = (X.sizes() != cudnn_input_dims_); - bool filter_changed = (filter.sizes() != cudnn_filter_dims_); + const bool input_changed = (X.sizes() != cudnn_input_dims_); + const bool filter_changed = (filter.sizes() != cudnn_filter_dims_); if (input_changed || filter_changed) { VLOG(1) << "Changing the cudnn descriptor configurations."; if (input_changed) { cudnn_input_dims_ = X.sizes().vec(); - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - bottom_desc_, - GetCudnnTensorFormat(order_), - cudnnTypeWrapper<T>::type, - N, - M, - H, - W)); + SetTensor4DDescriptorWithGroup( + cudnnTypeWrapper<T>::type, N, M, H, W, &bottom_desc_); } if (filter_changed) { cudnn_filter_dims_ = filter.sizes().vec(); +#if CUDNN_VERSION_MIN(7, 0, 0) + const int MM = M; +#else + const int MM = M / group_; +#endif CUDNN_ENFORCE(cudnnSetFilter4dDescriptor( filter_desc_, cudnnTypeWrapper<T>::type, GetCudnnTensorFormat(order_), - M, - C, + MM, + C / group_, kernel_h(), kernel_w())); if (InputSize() == 3) { @@ -226,14 +261,19 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { } } // Set the output - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - top_desc_, - GetCudnnTensorFormat(order_), - cudnnTypeWrapper<T>::type, - N, - C, - H_out, - W_out)); + SetTensor4DDescriptorWithGroup( + cudnnTypeWrapper<T>::type, N, C, H_out, W_out, &top_desc_); + if (InputSize() == 3) { + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + top_desc_for_bias_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper<T>::type, + N, + C, + H_out, + W_out)); + } + // Set the convolution descriptor CAFFE_ENFORCE_EQ( pad_t(), @@ -246,7 +286,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { "The current padding scheme leads to unequal padding on the left " "and right, which is not supported by cudnn."); // Set the convolution descriptor -#if CUDNN_VERSION_MIN(6,0,0) +#if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( conv_desc_, pad_t(), @@ -268,6 +308,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { 1, CUDNN_CROSS_CORRELATION)); #endif + #if CUDNN_VERSION_MIN(7, 0, 0) // enable TensorCore math if desired enable_tensor_core_ &= TensorCoreAvailable(); @@ -275,7 +316,10 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { CUDNN_ENFORCE( cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); } + // set cuDNN groups if appropriate + CUDNN_ENFORCE(cudnnSetConvolutionGroupCount(conv_desc_, group_)); #endif + if (force_algo_[ALGO_DGRAD] >= 0) { bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD]; } else if (deterministic_) { @@ -331,24 +375,56 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { VLOG(1) << "CuDNN workspace size: " << bwd_data_ws_size; } + const T* X_data = X.template data<T>(); + const T* filter_data = filter.template data<T>(); + T* Y_data = Y->template mutable_data<T>(); + // Now, actually run the computation. 
// Filter +#if CUDNN_VERSION_MIN(7, 0, 0) cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { CUDNN_ENFORCE(cudnnConvolutionBackwardData( state->cudnn_handle(), cudnnTypeWrapper<T>::kOne(), filter_desc_, - filter.template data<T>(), + filter_data, bottom_desc_, - X.template data<T>(), + X_data, conv_desc_, bwd_data_algo_, state->workspace().get(cudnn_ws_nbytes_), cudnn_ws_nbytes_, cudnnTypeWrapper<T>::kZero(), top_desc_, - Y->template mutable_data<T>())); + Y_data)); }); +#else + const int X_HxW = H * W; + const int Y_HxW = H_out * W_out; + const int group_offset_X = + order_ == StorageOrder::NCHW ? M / group_ * X_HxW : M / group_; + const int group_offset_Y = + order_ == StorageOrder::NCHW ? C / group_ * Y_HxW : C / group_; + const int group_offset_filter = filter.numel() / group_; + for (int i = 0; i < group_; ++i) { + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + CUDNN_ENFORCE( + cudnnConvolutionBackwardData(state->cudnn_handle(), + cudnnTypeWrapper<T>::kOne(), + filter_desc_, + filter_data + i * group_offset_filter, + bottom_desc_, + X_data + i * group_offset_X; + conv_desc_, + bwd_data_algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper<T_DX>::kZero(), + top_desc_, + Y_data + i * group_offset_Y)); + }); + } +#endif // Bias if (InputSize() == 3) { CUDNN_ENFORCE(cudnnAddTensor( @@ -357,7 +433,7 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { bias_desc_, Input(BIAS).template data<T>(), cudnnTypeWrapper<T>::kOne(), - top_desc_, + top_desc_for_bias_, Y->template mutable_data<T>())); } // Done. @@ -368,19 +444,19 @@ bool CudnnConvTransposeOp<T>::RunOnDevice() { // consolidating them. template <typename T> bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { - auto& X = Input(INPUT); - auto& filter = Input(FILTER); - auto& dY = Input(OUTPUT_GRAD); + const auto& X = Input(INPUT); + const auto& filter = Input(FILTER); + const auto& dY = Input(OUTPUT_GRAD); CAFFE_ENFORCE_EQ(X.dim(), 4); CAFFE_ENFORCE_EQ(filter.dim(), 4); int C = 0; switch (order_) { case StorageOrder::NHWC: - C = filter.dim32(3); + C = filter.dim32(3) * group_; break; case StorageOrder::NCHW: - C = filter.dim32(1); + C = filter.dim32(1) * group_; break; default: LOG(FATAL) << "Unknown storage order: " << order_; @@ -398,7 +474,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w()); - CAFFE_ENFORCE_EQ(filter.dim32(3), C); + CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_); break; case StorageOrder::NCHW: N = X.dim32(0); @@ -407,41 +483,42 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { W = X.dim32(3); H_out = dY.dim32(2); W_out = dY.dim32(3); - CAFFE_ENFORCE_EQ(filter.dim32(1), C); + CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_); CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w()); break; default: LOG(FATAL) << "Unknown storage order: " << order_; } + CAFFE_ENFORCE_EQ(M % group_, 0); + // Since we only handle LegacyPadding::NOTSET, we don't need to // compute padding. 
auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>()); // Set up the cudnn algorithms & workspace if necessary - bool input_changed = (X.sizes() != cudnn_input_dims_); - bool filter_changed = (filter.sizes() != cudnn_filter_dims_); + const bool input_changed = (X.sizes() != cudnn_input_dims_); + const bool filter_changed = (filter.sizes() != cudnn_filter_dims_); if (input_changed || filter_changed) { VLOG(1) << "Changing the cudnn descriptor configurations."; if (input_changed) { cudnn_input_dims_ = X.sizes().vec(); - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - bottom_desc_, - GetCudnnTensorFormat(order_), - cudnnTypeWrapper<T>::type, - N, - M, - H, - W)); + SetTensor4DDescriptorWithGroup( + cudnnTypeWrapper<T>::type, N, M, H, W, &bottom_desc_); } if (filter_changed) { cudnn_filter_dims_ = filter.sizes().vec(); +#if CUDNN_VERSION_MIN(7, 0, 0) + const int MM = M; +#else + const int MM = M / group_; +#endif CUDNN_ENFORCE(cudnnSetFilter4dDescriptor( filter_desc_, cudnnTypeWrapper<T>::type, GetCudnnTensorFormat(order_), - M, - C, + MM, + C / group_, kernel_h(), kernel_w())); if (!no_bias_) { @@ -456,14 +533,19 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { } } // Set the output - CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( - top_desc_, - GetCudnnTensorFormat(order_), - cudnnTypeWrapper<T>::type, - N, - C, - H_out, - W_out)); + SetTensor4DDescriptorWithGroup( + cudnnTypeWrapper<T>::type, N, C, H_out, W_out, &top_desc_); + if (!no_bias_) { + CUDNN_ENFORCE(cudnnSetTensor4dDescriptor( + top_desc_for_bias_, + GetCudnnTensorFormat(order_), + cudnnTypeWrapper<T>::type, + N, + C, + H_out, + W_out)); + } + // Set the convolution descriptor CAFFE_ENFORCE_EQ( pad_t(), @@ -475,7 +557,7 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { pad_r(), "The current padding scheme leads to unequal padding on the left " "and right, which is not supported by cudnn."); -#if CUDNN_VERSION_MIN(6,0,0) +#if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor( conv_desc_, pad_t(), @@ -504,6 +586,8 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { CUDNN_ENFORCE( cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); } + // set cuDNN groups if appropriate + CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_)); #endif if (force_algo_[ALGO_WGRAD] >= 0) { bwd_filter_algo_ = @@ -622,13 +706,14 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { CUDNN_ENFORCE(cudnnConvolutionBackwardBias( cudnn_wrapper_.inline_cudnn_handle(), cudnnTypeWrapper<T>::kOne(), - top_desc_, + top_desc_for_bias_, dY.template data<T>(), cudnnTypeWrapper<T>::kZero(), bias_desc_, dbias->template mutable_data<T>())); } +#if CUDNN_VERSION_MIN(7, 0, 0) cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { CUDNN_ENFORCE(cudnnConvolutionBackwardFilter( state->cudnn_handle(), @@ -647,7 +732,6 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { // Compute the gradient w.r.t. the input. - auto* dX = Output( no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), @@ -668,6 +752,55 @@ bool CudnnConvTransposeGradientOp<T>::RunOnDevice() { dX->template mutable_data<T>())); } }); +#else + const int X_HxW = H * W; + const int Y_HxW = H_out * W_out; + const int group_offset_X = + order_ == StorageOrder::NCHW ? M / group_ * X_HxW : M / group_; + const int group_offset_Y = + order_ == StorageOrder::NCHW ? 
C / group_ * Y_HxW : C / group_; + const int group_offset_filter = filter.numel() / group_; + for (int i = 0; i < group_; ++i) { + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + CUDNN_ENFORCE(cudnnConvolutionBackwardFilter( + state->cudnn_handle(), + cudnnTypeWrapper<T>::kOne(), + top_desc_, + dY.template data<T>() + i * group_offset_Y, + bottom_desc_, + X.template data<T>() + i * group_offset_X, + conv_desc_, + bwd_filter_algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper<T>::kZero(), + filter_desc_, + dfilter->template mutable_data<T>() + i * group_offset_filter)); + if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { + // Compute the gradient w.r.t. the input. + auto* dX = Output( + no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, + X.sizes(), + at::dtype<T>()); + cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) { + CUDNN_ENFORCE(cudnnConvolutionForward( + state->cudnn_handle(), + cudnnTypeWrapper<T>::kOne(), + top_desc_, + dY.template data<T>() + i * group_offset_Y, + filter_desc_, + filter.template data<T>() + i * group_offset_filter, + conv_desc_, + algo_, + state->workspace().get(cudnn_ws_nbytes_), + cudnn_ws_nbytes_, + cudnnTypeWrapper<T>::kZero(), + bottom_desc_, + dX->template mutable_data<T>() + i * group_offset_X)); + }); + } + } +#endif return true; } diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h index 41af81c2c6..333f782920 100644 --- a/caffe2/operators/conv_transpose_op_impl.h +++ b/caffe2/operators/conv_transpose_op_impl.h @@ -3,11 +3,15 @@ #ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_ #define CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_ +#include "caffe2/operators/conv_transpose_op.h" + +#include <array> +#include <vector> + #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/conv_op_shared.h" -#include "caffe2/operators/conv_transpose_op.h" #include "caffe2/operators/conv_transpose_unpool_op_base.h" #include "caffe2/utils/math.h" @@ -17,551 +21,618 @@ namespace caffe2 { template <typename T, class Context> bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() { - const Tensor& X = Input(INPUT); - auto& filter = Input(FILTER); - const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3); - CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor"); - CAFFE_ENFORCE( - filter.dim32(0) == M, - "filter number must be equal to input channel number"); - const int C = filter.dim32(1); - CAFFE_ENFORCE( - filter.dim32(2) == this->kernel_h(), + const auto& X = Input(INPUT); + const auto& filter = Input(FILTER); + CAFFE_ENFORCE_EQ(X.dim(), 4, "Input must be 4D tensor"); + CAFFE_ENFORCE_EQ(filter.dim(), 4, "filter must be 4D tensor"); + const int N = X.dim32(0); + const int M = X.dim32(1); + const int H = X.dim32(2); + const int W = X.dim32(3); + const int G = group_; + CAFFE_ENFORCE_EQ(M, filter.dim32(0)); + CAFFE_ENFORCE_EQ( + M % G, 0, "The number of input channels is not divisible by group."); + const int C = filter.dim32(1) * G; + CAFFE_ENFORCE_EQ( + filter.dim32(2), + kernel_h(), "filter height must be equal to kernel height"); - CAFFE_ENFORCE( - filter.dim32(3) == this->kernel_w(), + CAFFE_ENFORCE_EQ( + filter.dim32(3), + this->kernel_w(), "filter width must be equal to kernel width"); - auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C); - Tensor* Y = Output(0, sizes, at::dtype<T>()); + const std::vector<std::int64_t> 
Y_dims = + ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C); + auto* Y = Output(0, Y_dims, at::dtype<T>()); - const int kernel_dim = C * this->kernel_h() * this->kernel_w(); - const int input_image_size = H * W; - const int output_image_size = Y->dim32(2) * Y->dim32(3); + if (N == 0) { + return true; + } + const int K_HxW = kernel_h() * kernel_w(); + const int kernel_dim = C / G * K_HxW; + const int X_HxW = H * W; + const int Y_HxW = Y->dim32(2) * Y->dim32(3); + + const T* X_data = X.template data<T>(); + const T* filter_data = filter.template data<T>(); + const T* bias_data = nullptr; if (InputSize() == 3) { auto& bias = Input(BIAS); - CAFFE_ENFORCE(bias.dim() == 1, "bias must be 1D tensor"); - CAFFE_ENFORCE( - bias.dim32(0) == C, + CAFFE_ENFORCE_EQ(bias.dim(), 1, "bias must be 1D tensor"); + CAFFE_ENFORCE_EQ( + bias.dim32(0), + C, "bias dimension must be equal to output channel number"); - ReinitializeTensor( - &bias_multiplier_, - {1, output_image_size}, - at::dtype<T>().device(Context::GetDeviceType())); - T* bm_data = bias_multiplier_.template mutable_data<T>(); - math::Set<T, Context>( - output_image_size, - static_cast<T>(1), - bm_data, - &context_); + bias_data = bias.template data<T>(); } + T* Y_data = Y->template mutable_data<T>(); - const T* Xdata = X.template data<T>(); - const T* filter_data = filter.template data<T>(); - T* Ydata = Y->template mutable_data<T>(); + const std::vector<std::int64_t> buffer_shape = { + C, kernel_h(), kernel_w(), H, W}; - auto f = [&](Tensor* col_buffer) { - ReinitializeTensor(col_buffer, vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W}, at::dtype<T>().device(Context::GetDeviceType())); + const auto func = [&](Tensor* col_buffer) { + ReinitializeTensor( + col_buffer, + buffer_shape, + at::dtype<T>().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer->template mutable_data<T>(); - for (auto image_id = 0; image_id < N; ++image_id) { + for (int image_id = 0; image_id < N; ++image_id) { // Weight term - math::Gemm<T, Context>( - CblasTrans, - CblasNoTrans, - kernel_dim, - input_image_size, - M, - 1, - filter_data, - Xdata, - 0, - col_buffer_data, - &context_); + if (G == 1) { + math::Gemm<T, Context>( + CblasTrans, + CblasNoTrans, + kernel_dim, + X_HxW, + M, + 1.0f, + filter_data, + X_data + image_id * M * X_HxW, + 0.0f, + col_buffer_data, + &context_); + } else { + math::GemmStridedBatched<T, Context>( + CblasTrans, + CblasNoTrans, + G, + kernel_dim, + X_HxW, + M / G, + 1.0f, + filter_data, + M / G * kernel_dim, + X_data + image_id * M * X_HxW, + M / G * X_HxW, + 0.0f, + col_buffer_data, + col_buffer->numel() / G, + &context_); + } // Col2Im math::Col2Im<T, Context, StorageOrder::NCHW>( C, Y->dim32(2), Y->dim32(3), - this->kernel_h(), - this->kernel_w(), + kernel_h(), + kernel_w(), 1, 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), + pad_t(), + pad_l(), + pad_b(), + pad_r(), + stride_h(), + stride_w(), col_buffer_data, - Ydata, + Y_data + image_id * C * Y_HxW, &context_); - // Bias term - if (InputSize() == 3) { - const T* bias_data = Input(BIAS).template data<T>(); - const T* bm_data = bias_multiplier_.template data<T>(); -#if !defined(__ARM_NEON__) && !defined(__ARM_NEON) - math::Gemm<T, Context>( - CblasNoTrans, - CblasNoTrans, - C, - output_image_size, - 1, - 1, - bias_data, - bm_data, - 1, - Ydata, - &context_); -#else + if (bias_data != nullptr) { + // Bias term +#if defined(__ARM_NEON__) || defined(__ARM_NEON) math::BiasCHW<T, Context>( bias_data, - 
bm_data, + nullptr, C, - output_image_size, - Ydata, + Y_HxW, + Y_data + image_id * C * Y_HxW, &context_); #endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON) } - - Xdata += M * H * W; - Ydata += Y->numel() / Y->dim32(0); + } + if (bias_data != nullptr) { +#if !defined(__ARM_NEON__) && !defined(__ARM_NEON) + // Bias term + const std::array<int, 3> Y_dims = {N, C, Y_HxW}; + const std::array<int, 3> b_dims = {1, C, 1}; + math::Add<T, Context>( + 3, + Y_dims.data(), + 3, + b_dims.data(), + Y_data, + bias_data, + Y_data, + &context_); +#endif } }; + if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) { - runWithSharedBuffer<Context>(ws_, f); + runWithSharedBuffer<Context>(ws_, func); } else { - f(&col_buffer_); + func(&col_buffer_); } return true; } template <typename T, class Context> bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() { - const Tensor& X = Input(INPUT); + const auto& X = Input(INPUT); auto& filter = Input(FILTER); - const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3); - CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor"); - CAFFE_ENFORCE( - filter.dim32(0) == M, + CAFFE_ENFORCE_EQ(filter.dim(), 4, "filter must be 4D tensor"); + const int N = X.dim32(0); + const int H = X.dim32(1); + const int W = X.dim32(2); + const int M = X.dim32(3); + const int G = group_; + CAFFE_ENFORCE_EQ( + filter.dim32(0), + M, "filter number must be equal to input channel number"); - CAFFE_ENFORCE( - filter.dim32(1) == this->kernel_h(), + CAFFE_ENFORCE_EQ( + M % G, 0, "The number of input channels is not divisible by group."); + const int C = filter.dim32(3) * G; + CAFFE_ENFORCE_EQ( + filter.dim32(1), + kernel_h(), "filter height must be equal to kernel height"); - CAFFE_ENFORCE( - filter.dim32(2) == this->kernel_w(), + CAFFE_ENFORCE_EQ( + filter.dim32(2), + kernel_w(), "filter width must be equal to kernel width"); - const int C = filter.dim32(3); - auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C); - Tensor* Y = Output(0, sizes, at::dtype<T>()); - const auto kernel_dim = C * this->kernel_h() * this->kernel_w(); - const auto input_image_size = H * W; - const auto output_image_size = Y->dim32(1) * Y->dim32(2); + const std::vector<std::int64_t> Y_dims = + ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C); + auto* Y = Output(0, Y_dims, at::dtype<T>()); + if (N == 0) { + return true; + } + + const int K_HxW = kernel_h() * kernel_w(); + const int kernel_dim = C / G * K_HxW; + const int X_HxW = H * W; + const int Y_HxW = Y->dim32(1) * Y->dim32(2); + const T* X_data = X.template data<T>(); + const T* filter_data = filter.template data<T>(); + const T* bias_data = nullptr; if (InputSize() == 3) { auto& bias = Input(BIAS); - CAFFE_ENFORCE(bias.dim() == 1, "bias must be 1D tensor"); - CAFFE_ENFORCE( - bias.dim32(0) == C, + CAFFE_ENFORCE_EQ(bias.dim(), 1, "bias must be 1D tensor"); + CAFFE_ENFORCE_EQ( + bias.dim32(0), + C, "bias dimension must be equal to output channel number"); - // TODO(jerryzh): is it OK to remove the check of whether numel is output_image_size - ReinitializeTensor( - &bias_multiplier_, - {1, output_image_size}, - at::dtype<T>().device(Context::GetDeviceType())); - T* bm_data = bias_multiplier_.template mutable_data<T>(); - math::Set<T, Context>( - output_image_size, - static_cast<T>(1), - bm_data, - &context_); + bias_data = bias.template data<T>(); } - const T* Xdata = X.template data<T>(); - const T* filter_data = filter.template data<T>(); - T* Ydata = Y->template mutable_data<T>(); + T* Y_data = Y->template 
mutable_data<T>(); - auto f = [&](Tensor* /*col_buffer*/) { + const std::vector<std::int64_t> buffer_shape = { + G, H, W, kernel_h(), kernel_w(), C / G}; + const auto func = [&](Tensor* /*col_buffer*/) { ReinitializeTensor( &col_buffer_, - vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C}, + buffer_shape, at::dtype<T>().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data<T>(); - for (auto image_id = 0; image_id < N; ++image_id) { + for (int image_id = 0; image_id < N; ++image_id) { // Weight term - math::Gemm<T, Context>( - CblasNoTrans, - CblasNoTrans, - input_image_size, - kernel_dim, - M, - 1, - Xdata, - filter_data, - 0, - col_buffer_data, - &context_); + if (G == 1) { + math::Gemm<T, Context>( + CblasNoTrans, + CblasNoTrans, + X_HxW, + kernel_dim, + M, + 1.0f, + X_data + image_id * M * X_HxW, + filter_data, + 0.0f, + col_buffer_data, + &context_); + } else { + for (int group_id = 0; group_id < G; ++group_id) { + math::GemmEx<T, Context>( + CblasNoTrans, + CblasNoTrans, + X_HxW, + kernel_dim, + M / G, + 1.0f, + X_data + image_id * M * X_HxW + group_id * M / G, + M, + filter_data + group_id * M / G * kernel_dim, + kernel_dim, + 0.0f, + col_buffer_data + group_id * kernel_dim, + G * kernel_dim, + &context_); + } + } // Col2Im math::Col2Im<T, Context, StorageOrder::NHWC>( C, Y->dim32(1), Y->dim32(2), - this->kernel_h(), - this->kernel_w(), + kernel_h(), + kernel_w(), 1, 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), + pad_t(), + pad_l(), + pad_b(), + pad_r(), + stride_h(), + stride_w(), col_buffer_data, - Ydata, - &context_); + Y_data + image_id * C * Y_HxW, + &context_, + G); + } + if (bias_data != nullptr) { // Bias term - if (InputSize() == 3) { - const T* bm_data = bias_multiplier_.template data<T>(); - const T* bias_data = Input(BIAS).template data<T>(); - math::Gemm<T, Context>( - CblasNoTrans, - CblasNoTrans, - output_image_size, - C, - 1, - 1, - bm_data, - bias_data, - 1, - Ydata, - &context_); - } - Xdata += M * H * W; - Ydata += Y->numel() / Y->dim32(0); + const std::array<int, 2> Y_dims = {N * Y_HxW, C}; + const std::array<int, 2> b_dims = {1, C}; + math::Add<T, Context>( + 2, + Y_dims.data(), + 2, + b_dims.data(), + Y_data, + bias_data, + Y_data, + &context_); } }; + if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) { - runWithSharedBuffer<Context>(ws_, f); + runWithSharedBuffer<Context>(ws_, func); } else { - f(&col_buffer_); + func(&col_buffer_); } return true; } template <typename T, class Context> bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() { - auto& X = Input(INPUT); - auto& filter = Input(FILTER); - auto& dY = Input(OUTPUT_GRAD); - - const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3); - // We only handle LegacyPadding::NOTSET case and ignore cases of - // LegacyPadding::VALID and LegacyPadding::SAME - // Thus, we don't need to manually compute padding values - // We simply use the values from the user - CAFFE_ENFORCE(filter.dim() == 4); - const int C = filter.dim32(1); - CAFFE_ENFORCE( - filter.dim32(2) == this->kernel_h(), + const auto& X = Input(INPUT); + const auto& filter = Input(FILTER); + const auto& dY = Input(OUTPUT_GRAD); + CAFFE_ENFORCE_EQ(filter.dim(), 4); + const int N = X.dim32(0); + const int M = X.dim32(1); + const int H = X.dim32(2); + const int W = X.dim32(3); + const int G = group_; + CAFFE_ENFORCE_EQ(M, filter.dim32(0)); + CAFFE_ENFORCE_EQ( + M % G, 0, "The number of input channels is 
not divisible by group."); + const int C = filter.dim32(1) * G; + CAFFE_ENFORCE_EQ(C, dY.dim32(1)); + CAFFE_ENFORCE_EQ( + filter.dim32(2), + kernel_h(), "filter height must be equal to kernel height"); - CAFFE_ENFORCE( - filter.dim32(3) == this->kernel_w(), + CAFFE_ENFORCE_EQ( + filter.dim32(3), + this->kernel_w(), "filter width must be equal to kernel width"); + + const int K_HxW = kernel_h() * kernel_w(); + const int kernel_dim = C / G * K_HxW; + const int X_HxW = H * W; + const int Y_HxW = dY.dim32(2) * dY.dim32(3); auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>()); - const int kernel_dim = C * this->kernel_h() * this->kernel_w(); - const int output_image_size = dY.dim32(2) * dY.dim32(3); - // The col buffer is stored in CHW order as well - ReinitializeTensor( - &col_buffer_, - vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W}, - at::dtype<T>().device(Context::GetDeviceType())); - if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - dbias->Resize(C); - // TODO(jerryzh): is it OK to remove the check of whether numel is output_image_size - ReinitializeTensor( - &bias_multiplier_, - {1, output_image_size}, - at::dtype<T>().device(Context::GetDeviceType())); - T* bm_data = bias_multiplier_.template mutable_data<T>(); - math::Set<T, Context>( - output_image_size, - static_cast<T>(1), - bm_data, - &context_); - } - T* col_buffer_data = col_buffer_.template mutable_data<T>(); - const T* Xdata = X.template data<T>(); + const T* X_data = X.template data<T>(); const T* filter_data = filter.template data<T>(); - const T* dYdata = dY.template data<T>(); + const T* dY_data = dY.template data<T>(); T* dfilter_data = dfilter->template mutable_data<T>(); - // Pre-setting the gradients to zero - math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_); + T* dbias_data = nullptr; + T* dX_data = nullptr; if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - T* dbias_data = dbias->template mutable_data<T>(); - math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_); + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {C}, at::dtype<T>()); + dbias_data = dbias->template mutable_data<T>(); } - for (auto image_id = 0; image_id < N; ++image_id) { + const bool compute_dX = + (OutputSize() == 3) || (no_bias_ && (OutputSize() == 2)); + if (compute_dX) { + auto* dX = Output( + no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); + dX_data = dX->template mutable_data<T>(); + } + math::Set<T, Context>(filter.numel(), T(0), dfilter_data, &context_); + + if (N == 0) { + math::Set<T, Context>(C, T(0), dbias_data, &context_); + return true; + } + + ReinitializeTensor( + &col_buffer_, + std::vector<std::int64_t>{C, kernel_h(), kernel_w(), H, W}, + at::dtype<T>().device(Context::GetDeviceType())); + T* col_buffer_data = col_buffer_.template mutable_data<T>(); + + for (int image_id = 0; image_id < N; ++image_id) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. math::Im2Col<T, Context, StorageOrder::NCHW>( C, dY.dim32(2), dY.dim32(3), - this->kernel_h(), - this->kernel_w(), + kernel_h(), + kernel_w(), 1, 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), - dYdata, + pad_t(), + pad_l(), + pad_b(), + pad_r(), + stride_h(), + stride_w(), + dY_data + image_id * C * Y_HxW, col_buffer_data, &context_); // Gemm - math::Gemm<T, Context>( - CblasNoTrans, - CblasTrans, - M, - kernel_dim, - H * W, - 1, - Xdata, - col_buffer_data, - 1, - dfilter_data, - &context_); - // gradient w.r.t. 
bias - if (!no_bias_) { - const T* bm_data = bias_multiplier_.template data<T>(); - T* input_grad_data = Output(BIAS_OR_INPUT_GRAD)->template mutable_data<T>(); + if (G == 1) { math::Gemm<T, Context>( CblasNoTrans, - CblasNoTrans, - C, - 1, - output_image_size, - 1, - dYdata, - bm_data, - 1, - input_grad_data, - &context_); - } - dYdata += dY.numel() / dY.dim32(0); - Xdata += X.numel() / X.dim32(0); - } - if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { - // Compute gradients w.r.t. the input - // Since we have changed dYdata in the above loop, we will need to reset. - dYdata = dY.template data<T>(); - - auto* dX = Output( - no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); - T* dXdata = dX->template mutable_data<T>(); - for (auto image_id = 0; image_id < N; ++image_id) { - // Im2Col. - // TODO(zyan3): Probably duplicate work as in gradient computation - // w.r.t filters - math::Im2Col<T, Context, StorageOrder::NCHW>( - C, - dY.dim32(2), - dY.dim32(3), - this->kernel_h(), - this->kernel_w(), - 1, - 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), - dYdata, + CblasTrans, + M, + kernel_dim, + X_HxW, + 1.0f, + X_data + image_id * M * X_HxW, col_buffer_data, + 1.0f, + dfilter_data, &context_); - // Gemm - math::Gemm<T, Context>( + } else { + math::GemmStridedBatched<T, Context>( CblasNoTrans, - CblasNoTrans, - M, - H * W, + CblasTrans, + G, + M / G, kernel_dim, - 1, - filter_data, + X_HxW, + 1.0f, + X_data + image_id * M * X_HxW, + M / G * X_HxW, col_buffer_data, - 0, - dXdata, + col_buffer_.numel() / G, + 1.0f, + dfilter_data, + M / G * kernel_dim, &context_); - dYdata += dY.numel() / dY.dim32(0); - dXdata += X.numel() / X.dim32(0); + } + + if (dX_data != nullptr) { + // Compute gradients w.r.t. the input + if (G == 1) { + math::Gemm<T, Context>( + CblasNoTrans, + CblasNoTrans, + M, + X_HxW, + kernel_dim, + 1.0f, + filter_data, + col_buffer_data, + 0.0f, + dX_data + image_id * M * X_HxW, + &context_); + } else { + math::GemmStridedBatched<T, Context>( + CblasNoTrans, + CblasNoTrans, + G, + M / G, + X_HxW, + kernel_dim, + 1.0f, + filter_data, + M / G * kernel_dim, + col_buffer_data, + col_buffer_.numel() / G, + 0.0f, + dX_data + image_id * M * X_HxW, + M / G * X_HxW, + &context_); + } } } + + if (dbias_data != nullptr) { + // gradient w.r.t. 
bias + const std::array<int, 3> Y_dims = {N, C, Y_HxW}; + const std::array<int, 3> b_dims = {1, C, 1}; + math::ReduceSum<T, Context>( + 3, Y_dims.data(), b_dims.data(), T(1), dY_data, dbias_data, &context_); + } + return true; } template <typename T, class Context> bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() { - auto& X = Input(INPUT); - auto& filter = Input(FILTER); - auto& dY = Input(OUTPUT_GRAD); - - const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3); - // We only handle LegacyPadding::NOTSET case and ignore cases of - // LegacyPadding::VALID and LegacyPadding::SAME - // Thus, we don't need to manually compute padding values - // We simply use the values from the user - CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor"); - CAFFE_ENFORCE( - filter.dim32(1) == this->kernel_h(), + const auto& X = Input(INPUT); + const auto& filter = Input(FILTER); + const auto& dY = Input(OUTPUT_GRAD); + CAFFE_ENFORCE_EQ(filter.dim(), 4); + const int N = X.dim32(0); + const int H = X.dim32(1); + const int W = X.dim32(2); + const int M = X.dim32(3); + const int G = group_; + CAFFE_ENFORCE_EQ(M, filter.dim32(0)); + CAFFE_ENFORCE_EQ( + M % G, 0, "The number of input channels is not divisible by group."); + const int C = filter.dim32(3) * G; + CAFFE_ENFORCE_EQ(C, dY.dim32(3)); + CAFFE_ENFORCE_EQ( + filter.dim32(1), + kernel_h(), "filter height must be equal to kernel height"); - CAFFE_ENFORCE( - filter.dim32(2) == this->kernel_w(), + CAFFE_ENFORCE_EQ( + filter.dim32(2), + this->kernel_w(), "filter width must be equal to kernel width"); - const int C = filter.dim32(3); + CAFFE_ENFORCE_EQ(dY.dim32(3), C); + + const int K_HxW = kernel_h() * kernel_w(); + const int kernel_dim = C / G * K_HxW; + const int X_HxW = H * W; + const int Y_HxW = dY.dim32(1) * dY.dim32(2); auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>()); - const int kernel_dim = C * this->kernel_h() * this->kernel_w(); - const int output_image_size = dY.dim32(1) * dY.dim32(2); - // The col buffer is stored in HWC order as well - ReinitializeTensor( - &col_buffer_, - vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C}, - at::dtype<T>().device(Context::GetDeviceType())); - if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - dbias->Resize(C); - // TODO(jerryzh): is it OK to remove the check of whether numel is output_image_size - ReinitializeTensor( - &bias_multiplier_, - {1, output_image_size}, - at::dtype<T>().device(Context::GetDeviceType())); - T* bm_data = bias_multiplier_.template mutable_data<T>(); - math::Set<T, Context>( - output_image_size, - static_cast<T>(1), - bm_data, - &context_); - } - T* col_buffer_data = col_buffer_.template mutable_data<T>(); - const T* Xdata = X.template data<T>(); + const T* X_data = X.template data<T>(); const T* filter_data = filter.template data<T>(); - const T* dYdata = dY.template data<T>(); + const T* dY_data = dY.template data<T>(); T* dfilter_data = dfilter->template mutable_data<T>(); - // Pre-setting the gradients to zero - math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_); + T* dbias_data = nullptr; + T* dX_data = nullptr; if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - T* dbias_data = dbias->template mutable_data<T>(); - math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_); + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {C}, at::dtype<T>()); + dbias_data = dbias->template mutable_data<T>(); + } + const bool compute_dX = + (OutputSize() == 3) || (no_bias_ && (OutputSize() == 
2)); + if (compute_dX) { + auto* dX = Output( + no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); + dX_data = dX->template mutable_data<T>(); } - for (auto image_id = 0; image_id < N; ++image_id) { + math::Set<T, Context>(filter.numel(), T(0), dfilter_data, &context_); + + if (N == 0) { + math::Set<T, Context>(C, T(0), dbias_data, &context_); + return true; + } + + ReinitializeTensor( + &col_buffer_, + std::vector<std::int64_t>{C, kernel_h(), kernel_w(), H, W}, + at::dtype<T>().device(Context::GetDeviceType())); + T* col_buffer_data = col_buffer_.template mutable_data<T>(); + + for (int image_id = 0; image_id < N; ++image_id) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. math::Im2Col<T, Context, StorageOrder::NHWC>( C, dY.dim32(1), dY.dim32(2), - this->kernel_h(), - this->kernel_w(), + kernel_h(), + kernel_w(), 1, 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), - dYdata, + pad_t(), + pad_l(), + pad_b(), + pad_r(), + stride_h(), + stride_w(), + dY_data + image_id * C * Y_HxW, col_buffer_data, - &context_); + &context_, + G); // Gemm - math::Gemm<T, Context>( - CblasTrans, - CblasNoTrans, - M, - kernel_dim, - H * W, - 1, - Xdata, - col_buffer_data, - 1, - dfilter_data, - &context_); - // gradients w.r.t. bias - if (!no_bias_) { - const T* bm_data = bias_multiplier_.template data<T>(); - T* input_grad_data = Output(BIAS_OR_INPUT_GRAD)->template mutable_data<T>(); + if (G == 1) { math::Gemm<T, Context>( CblasTrans, CblasNoTrans, - C, - 1, - output_image_size, - 1, - dYdata, - bm_data, - 1, - input_grad_data, - &context_); - } - dYdata += dY.numel() / dY.dim32(0); - Xdata += X.numel() / X.dim32(0); - } - if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) { - // Compute gradients w.r.t. the input - // Since we have changed dYdata in the above loop, we will need to reset. - dYdata = dY.template data<T>(); - - auto* dX = Output( - no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); - T* dXdata = dX->template mutable_data<T>(); - for (auto image_id = 0; image_id < N; ++image_id) { - // Im2Col. - // TODO(zyan3): Probably duplicate work as in gradient computation - // w.r.t filters - math::Im2Col<T, Context, StorageOrder::NHWC>( - C, - dY.dim32(1), - dY.dim32(2), - this->kernel_h(), - this->kernel_w(), - 1, - 1, - this->pad_t(), - this->pad_l(), - this->pad_b(), - this->pad_r(), - this->stride_h(), - this->stride_w(), - dYdata, - col_buffer_data, - &context_); - // Gemm - math::Gemm<T, Context>( - CblasNoTrans, - CblasTrans, - H * W, M, kernel_dim, - 1, + X_HxW, + 1.0f, + X_data + image_id * M * X_HxW, col_buffer_data, - filter_data, - 0, - dXdata, + 1.0f, + dfilter_data, &context_); - dYdata += dY.numel() / dY.dim32(0); - dXdata += X.numel() / X.dim32(0); + } else { + for (int group_id = 0; group_id < G; ++group_id) { + math::GemmEx<T, Context>( + CblasTrans, + CblasNoTrans, + M / G, + kernel_dim, + X_HxW, + 1.0f, + X_data + image_id * M * X_HxW + group_id * M / G, + M, + col_buffer_data + group_id * kernel_dim, + G * kernel_dim, + 1.0f, + dfilter_data + group_id * M / G * kernel_dim, + kernel_dim, + &context_); + } + } + + if (dX_data != nullptr) { + // Compute gradients w.r.t. 
the input + if (G == 1) { + math::Gemm<T, Context>( + CblasNoTrans, + CblasTrans, + X_HxW, + M, + kernel_dim, + 1.0f, + col_buffer_data, + filter_data, + 0.0f, + dX_data + image_id * M * X_HxW, + &context_); + } else { + for (int group_id = 0; group_id < G; ++group_id) { + math::GemmEx<T, Context>( + CblasNoTrans, + CblasTrans, + X_HxW, + M / G, + kernel_dim, + 1.0f, + col_buffer_data + group_id * kernel_dim, + G * kernel_dim, + filter_data + group_id * M / G * kernel_dim, + kernel_dim, + 0.0f, + dX_data + image_id * M * X_HxW + group_id * M / G, + M, + &context_); + } + } } } + + if (dbias_data != nullptr) { + const std::array<int, 2> Y_dims = {N * Y_HxW, C}; + const std::array<int, 2> b_dims = {1, C}; + math::ReduceSum<T, Context>( + 2, Y_dims.data(), b_dims.data(), T(1), dY_data, dbias_data, &context_); + } + return true; } } // namespace caffe2 + #endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_ diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h index 7ebfda7e09..c98b3bae21 100644 --- a/caffe2/operators/conv_transpose_unpool_op_base.h +++ b/caffe2/operators/conv_transpose_unpool_op_base.h @@ -17,7 +17,9 @@ template <class Context> class ConvTransposeUnpoolBase : public Operator<Context> { public: USE_OPERATOR_CONTEXT_FUNCTIONS; - explicit ConvTransposeUnpoolBase(const OperatorDef& operator_def, Workspace* ws) + explicit ConvTransposeUnpoolBase( + const OperatorDef& operator_def, + Workspace* ws) : Operator<Context>(operator_def, ws), legacy_pad_( static_cast<LegacyPadding>(this->template GetSingleArgument<int>( @@ -27,6 +29,7 @@ class ConvTransposeUnpoolBase : public Operator<Context> { stride_(this->template GetRepeatedArgument<int>("strides")), pads_(this->template GetRepeatedArgument<int>("pads")), adj_(this->template GetRepeatedArgument<int>("adjs")), + group_(this->template GetSingleArgument<int>("group", 1)), order_(StringToStorageOrder( this->template GetSingleArgument<string>("order", "NCHW"))), shared_buffer_( @@ -206,19 +209,7 @@ class ConvTransposeUnpoolBase : public Operator<Context> { virtual ~ConvTransposeUnpoolBase() {} - private: - LegacyPadding legacy_pad_; - int pad_; - protected: - vector<int> kernel_; - vector<int> stride_; - vector<int> pads_; - vector<int> adj_; - StorageOrder order_; - bool shared_buffer_; - Workspace* ws_; - // Accessors for 2D conv params. 
inline int pad_t() const { @@ -289,14 +280,35 @@ class ConvTransposeUnpoolBase : public Operator<Context> { break; } } + + LegacyPadding legacy_pad_; + int pad_; + + std::vector<int> kernel_; + std::vector<int> stride_; + std::vector<int> pads_; + std::vector<int> adj_; + int group_; + StorageOrder order_; + bool shared_buffer_; + Workspace* ws_; }; #define USE_CONV_TRANSPOSE_UNPOOL_BASE_FUNCTIONS(Context) \ USE_OPERATOR_FUNCTIONS(Context); \ using ConvTransposeUnpoolBase<Context>::kernel_; \ + using ConvTransposeUnpoolBase<Context>::kernel_h; \ + using ConvTransposeUnpoolBase<Context>::kernel_w; \ using ConvTransposeUnpoolBase<Context>::stride_; \ + using ConvTransposeUnpoolBase<Context>::stride_h; \ + using ConvTransposeUnpoolBase<Context>::stride_w; \ using ConvTransposeUnpoolBase<Context>::pads_; \ + using ConvTransposeUnpoolBase<Context>::pad_t; \ + using ConvTransposeUnpoolBase<Context>::pad_l; \ + using ConvTransposeUnpoolBase<Context>::pad_b; \ + using ConvTransposeUnpoolBase<Context>::pad_r; \ using ConvTransposeUnpoolBase<Context>::adj_; \ + using ConvTransposeUnpoolBase<Context>::group_; \ using ConvTransposeUnpoolBase<Context>::order_; \ using ConvTransposeUnpoolBase<Context>::shared_buffer_; \ using ConvTransposeUnpoolBase<Context>::ws_ diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py index 2a32f9acee..272ac3a3e7 100644 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ b/caffe2/python/operator_test/conv_transpose_test.py @@ -6,6 +6,7 @@ import numpy as np from hypothesis import assume, given, settings import hypothesis.strategies as st +from caffe2.proto import caffe2_pb2 from caffe2.python import core, utils import caffe2.python.hypothesis_test_util as hu import caffe2.python.hip_test_util as hiputl @@ -360,6 +361,68 @@ class TestConvolutionTranspose(hu.HypothesisTestCase): for i in outputs_to_check: self.assertGradientChecks(gc, op, inputs, i, [0]) + @given(stride=st.integers(1, 3), + pad=st.integers(0, 3), + kernel=st.integers(1, 3), + adj=st.integers(0, 2), + size=st.integers(7, 10), + input_channels=st.integers(1, 8), + output_channels=st.integers(1, 8), + batch_size=st.integers(1, 4), + group=st.integers(1, 4), + order=st.sampled_from(["NCHW", "NHWC"]), + engine=st.sampled_from(["", "CUDNN", "BLOCK"]), + shared_buffer=st.booleans(), + use_bias=st.booleans(), + **hu.gcs) + def test_convolution_transpose_with_group( + self, stride, pad, kernel, adj, size, input_channels, + output_channels, batch_size, group, order, engine, shared_buffer, + use_bias, gc, dc): + assume(adj < stride) + # TODO: Group conv_transpose in NHWC not implemented for GPU yet. 
+ assume(group == 1 or order == "NCHW" or + gc.device_type == caffe2_pb2.CPU) + if group != 1 and order == "NHWC": + dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] + + if hiputl.run_in_hip(gc, dc) and order == "NHWC": + engine = "" + + op = core.CreateOperator( + "ConvTranspose", + ["X", "w", "b"] if use_bias else ["X", "w"], + ["Y"], + stride=stride, + kernel=kernel, + pad=pad, + adj=adj, + group=group, + order=order, + engine=engine, + shared_buffer=int(shared_buffer), + device_option=gc, + ) + + input_channels *= group + output_channels *= group + + X = np.random.rand( + batch_size, size, size, input_channels).astype(np.float32) - 0.5 + w = np.random.rand( + input_channels, kernel, kernel, int(output_channels / group)) \ + .astype(np.float32) - 0.5 + b = np.random.rand(output_channels).astype(np.float32) - 0.5 + if order == "NCHW": + X = utils.NHWC2NCHW(X) + w = utils.NHWC2NCHW(w) + + inputs = [X, w, b] if use_bias else [X, w] + self.assertDeviceChecks(dc, op, inputs, [0]) + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + + if __name__ == "__main__": import unittest unittest.main() |
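Finally, as a self-contained sanity check of what grouped ConvTranspose computes, here is a minimal NumPy reference for the NCHW forward path. It assumes zero padding and zero output adjustment, and the function is an illustrative sketch rather than part of the Caffe2 API:

```python
import numpy as np

def grouped_conv_transpose_nchw(X, W, b=None, stride=1, group=1):
    """Reference grouped ConvTranspose with pad = 0 and adj = 0.

    X: (N, M, H, W) input, W: (M, C // group, kH, kW) filter, b: (C,) bias or None.
    """
    N, M, H, W_in = X.shape
    _, Cg, kH, kW = W.shape
    G, C = group, Cg * group
    assert M % G == 0, "input channels must be divisible by group"
    Mg = M // G
    H_out = (H - 1) * stride + kH
    W_out = (W_in - 1) * stride + kW
    Y = np.zeros((N, C, H_out, W_out), dtype=X.dtype)
    for n in range(N):
        for g in range(G):
            for m in range(g * Mg, (g + 1) * Mg):     # input channels of this group
                for c in range(Cg):                   # output channels of this group
                    for y in range(H):
                        for x in range(W_in):
                            Y[n, g * Cg + c,
                              y * stride:y * stride + kH,
                              x * stride:x * stride + kW] += X[n, m, y, x] * W[m, c]
    if b is not None:
        Y += b.reshape(1, C, 1, 1)
    return Y
```

Each group connects only its own `M / G` input channels to its own `C / G` output channels; with `group=1` this reduces to the ordinary ConvTranspose.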