diff options
author | Jerry Zhang <jerryzh@fb.com> | 2019-01-30 12:37:55 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-01-30 12:50:38 -0800 |
commit | 2af95d8e3e07515b4546edd49c251f47211e1768 (patch) | |
tree | 90c4f7f29a65727b7862f806942d17bb307b0c97 | |
parent | cdbd3882066f809531cb0250b5b8ad73b9549ccd (diff) | |
download | pytorch-2af95d8e3e07515b4546edd49c251f47211e1768.tar.gz pytorch-2af95d8e3e07515b4546edd49c251f47211e1768.tar.bz2 pytorch-2af95d8e3e07515b4546edd49c251f47211e1768.zip |
Back out "[pt1][tensor] Change ConvPoolOp<Context>::SetOutputSize to ConvPoolOp<Context>::GetOutputSize" (#16516)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16516
Original commit changeset: 64abce3dbaed
Reviewed By: dzhulgakov
Differential Revision: D13863715
fbshipit-source-id: f1923fdca4a1a82768d9c280a8493ff15a7eb2ba
29 files changed, 95 insertions, 115 deletions
diff --git a/caffe2/contrib/nnpack/nnpack_ops.cc b/caffe2/contrib/nnpack/nnpack_ops.cc index e6e82f4829..23e52a152e 100644 --- a/caffe2/contrib/nnpack/nnpack_ops.cc +++ b/caffe2/contrib/nnpack/nnpack_ops.cc @@ -119,6 +119,7 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> { auto& X = Input(0); auto& filter = Input(1); auto& bias = Input(2); + auto* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); const int M = filter.dim32(0); @@ -132,8 +133,7 @@ class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> { CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), ""); CAFFE_ENFORCE(bias.numel() == M, ""); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); const int oH = Y->dim32(2), oW = Y->dim32(3); if (N > 1) { @@ -250,10 +250,10 @@ class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> { bool RunOnDeviceWithOrderNCHW() override { auto& X = Input(0); + auto* Y = Output(0); CAFFE_ENFORCE(X.dim() == 4, ""); const int H = X.dim32(2), W = X.dim32(3); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1)); std::vector<int> pads( {this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()}); std::vector<int> stride({this->stride_h(), this->stride_w()}); diff --git a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc index ab52d0a070..c27e7e57c0 100644 --- a/caffe2/cuda_rtc/pool_op_rtc_gpu.cc +++ b/caffe2/cuda_rtc/pool_op_rtc_gpu.cc @@ -196,8 +196,8 @@ class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> { bool RunOnDeviceWithOrderNCHW() override { auto& X = Input(0); - auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, output_sizes, at::dtype<float>()); + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); if (input_dims_ != X.sizes()) { // recompile diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 7409c273e0..4cd912e2f2 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -257,10 +257,11 @@ void computeOutputHW( int* OH, int* OW) { Tensor input = caffe2::empty({1, 1, H, W}, at::dtype<float>().device(CPU)); - auto sizes = op->GetOutputSize(input, 1); - CAFFE_ENFORCE_EQ(sizes.size(), 4); - *OH = sizes[2]; - *OW = sizes[3]; + Tensor output(CPU); + op->SetOutputSize(input, &output, 1); + CAFFE_ENFORCE_EQ(output.dim(), 4); + *OH = output.size(2); + *OW = output.size(3); } constexpr int computeMPSAlignOffset(int kernel, int pad) { diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 4e7c081b76..9c96536849 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -516,13 +516,13 @@ template <typename T_X, typename T_W, typename T_B, typename T_Y> bool CudnnConvOp::DoRunWithType() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); // Figure out the output shape CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5); CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5); const int M = filter.dim32(0); - auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype<T_Y>()); + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, M); int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0; int group_offset_X = 0, group_offset_Y = 0; diff --git a/caffe2/operators/conv_op_eigen.cc b/caffe2/operators/conv_op_eigen.cc index 5a0412396a..c68de09115 100644 --- a/caffe2/operators/conv_op_eigen.cc +++ b/caffe2/operators/conv_op_eigen.cc @@ -34,14 +34,14 @@ template <typename T> bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); CAFFE_ENFORCE(4 == filter.dim()); const int M = filter.dim32(0); CAFFE_ENFORCE(filter.dim32(1) == C); CAFFE_ENFORCE(filter.dim32(2) == kernel_h()); CAFFE_ENFORCE(filter.dim32(3) == kernel_w()); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); Eigen::array<int64_t, 4> kernel_shuffles { {int64_t(2), int64_t(3), int64_t(1), int64_t(0)} }; Eigen::array<int64_t, 4> input_shuffles @@ -128,14 +128,14 @@ template <typename T> bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() { auto& X = Input(INPUT); auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3); CAFFE_ENFORCE(4 == filter.dim()); const int M = filter.dim32(0); CAFFE_ENFORCE(filter.dim32(1) == kernel_h()); CAFFE_ENFORCE(filter.dim32(2) == kernel_w()); CAFFE_ENFORCE(filter.dim32(3) == C); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); // Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for // optimization purposes, so we will create a temp one. Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter( diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h index fc5300f8dc..29d98c0b78 100644 --- a/caffe2/operators/conv_op_impl.h +++ b/caffe2/operators/conv_op_impl.h @@ -21,6 +21,7 @@ template <typename T, class Context> bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); const int N = X.dim32(0); const int C = X.dim32(1); const int G = group_; @@ -43,8 +44,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); kernel_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, M); const vector<int> X_dims = GetDims(X); const vector<int> Y_dims = GetDims(*Y); const int X_HxW = X.numel() / (N * C); @@ -190,6 +190,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() { "Only 1-3d convolution is supported for NHWC storage type"); const Tensor& X = Input(INPUT); const auto& filter = Input(FILTER); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(X.dim() - 1); const int G = group_; CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); @@ -211,8 +212,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); kernel_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, M); - auto* Y = Output(0, output_sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, M); const vector<int> Y_dims = GetDims(*Y); const int X_HxW = X.numel() / (N * C); const int Y_HxW = Y->numel() / (N * M); diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index 392d813dcf..b591d3dd46 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -207,7 +207,7 @@ class ConvPoolOpBase : public Operator<Context> { return size; } - // Gets the output size. The output channel is manually provided since + // Sets the output size. The output channel is manually provided since // it may not be identical to the input channels. // This function can be used in the forward functions to obtain the output // sizes. @@ -215,7 +215,8 @@ class ConvPoolOpBase : public Operator<Context> { // implementations that do not use first-class Tensor objects, such as the // MKL operator. One can still call this function with dummy // Tensor objects in order to obtain the sizes. - std::vector<int64_t> GetOutputSize(const Tensor& input, int output_channel) { + // TODO: passing sizes directly rather than Tensor + void SetOutputSize(const Tensor& input, Tensor* output, int output_channel) { CAFFE_ENFORCE(input.numel() > 0); vector<int> output_dims; int N = input.dim32(0); @@ -240,7 +241,7 @@ class ConvPoolOpBase : public Operator<Context> { output_dims.insert(output_dims.begin(), N); output_dims.push_back(output_channel); } - return std::vector<int64_t>(output_dims.cbegin(), output_dims.cend()); + output->Resize(output_dims); } // Helper function that is also called from OperatorSchema. Modified diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 9bf9df5132..94dea27f64 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -17,6 +17,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() { const Tensor& X = Input(INPUT); const Tensor& offset = Input(OFFSET); auto& filter = Input(FILTER); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.dim(), filter.ndim()); const int M = filter.dim32(0); @@ -81,8 +82,7 @@ bool DeformConvOp<T, Context>::RunOnDeviceWithOrderNCHW() { kernel_dims_size *= kernel_[i]; } - auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, filter.dim32(0)); - auto* Y = Output(0, output_sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0)); const vector<int> input_dims = GetDims(X); const vector<int> output_dims = GetDims(*Y); @@ -196,8 +196,8 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() { auto& offset = Input(OFFSET); auto& filter = Input(FILTER); auto& dY = Input(OUTPUT_GRAD); - - + + const int N = X.dim32(0), C = X.dim32(1); const vector<int> input_dims = this->GetDims(X); @@ -303,7 +303,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() { T* dbias_data = nullptr; if (!no_bias_) { - + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>()); if (bias_multiplier_.size() != output_image_size) { // If the helper bias multiplier is not M, reshape and fill it with one. @@ -323,7 +323,7 @@ bool DeformConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() { T* dXdata = nullptr; if (OutputSize() == 4 || (no_bias_ && (OutputSize() == 3))) { - + auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>()); dXdata = dX->template mutable_data<T>(); math::Set<T, Context>(dX->size(), 0, dXdata, &context_); diff --git a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu index f4b530ea7f..beefcfdd66 100644 --- a/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu +++ b/caffe2/operators/depthwise_3x3_conv_op_cudnn.cu @@ -288,6 +288,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> { bool RunOnDeviceWithOrderNCHW() override { const Tensor& X = Input(0); auto& filter = Input(1); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -299,8 +300,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CUDAContext> { CAFFE_ENFORCE_EQ(this->kernel_w(), 3); CAFFE_ENFORCE_EQ(this->kernel_h(), 3); CAFFE_ENFORCE_EQ(this->stride_h(), this->stride_w()); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, filter.dim32(0)); DepthwiseArgs args; args.batch = X.dim32(0); args.in_rows = X.dim32(2); @@ -455,7 +455,7 @@ class Depthwise3x3ConvGradientOp final : public ConvPoolOpBase<CUDAContext> { M, dY.dim32(2), dY.dim32(3))); - + auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<float>()); CUDNN_ENFORCE(cudnnConvolutionBackwardBias( cudnn_wrapper_.inline_cudnn_handle(), diff --git a/caffe2/operators/hip/conv_op_miopen.hip b/caffe2/operators/hip/conv_op_miopen.hip index fe4320bc09..376b49aa2a 100644 --- a/caffe2/operators/hip/conv_op_miopen.hip +++ b/caffe2/operators/hip/conv_op_miopen.hip @@ -205,6 +205,7 @@ template <typename T_X, typename T_W, typename T_B, typename MATH, typename T_Y> bool MIOPENConvOp::DoRunWithType() { auto& X = Input(INPUT); auto& Weight = Input(FILTER); + auto* Y = Output(0); // Figure out the output shape CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5); @@ -213,8 +214,7 @@ bool MIOPENConvOp::DoRunWithType() { "Conv op with MIOpen engine is supported only for 2D convolutions"); const int M = Weight.dim32(0); - auto sizes = ConvPoolOpBase<HIPContext>::GetOutputSize(X, M); - auto* Y = Output(0, sizes, at::dtype<T_Y>()); + ConvPoolOpBase<HIPContext>::SetOutputSize(X, Y, M); int N = X.dim32(0); int C = X.dim32(1); diff --git a/caffe2/operators/hip/pool_op_miopen.hip b/caffe2/operators/hip/pool_op_miopen.hip index c1d5ee387b..614b6cf09b 100644 --- a/caffe2/operators/hip/pool_op_miopen.hip +++ b/caffe2/operators/hip/pool_op_miopen.hip @@ -61,6 +61,7 @@ class MIOPENPoolOp : public ConvPoolOpBase<HIPContext> { template <typename T, typename M> bool DoRunWithType() { auto& X = Input(0); + auto* Y = Output(0); int N = 0, C = 0, H = 0, W = 0, D = 0; int N_out = 0, C_out = 0, H_out = 0, W_out = 0; CAFFE_ENFORCE(X.ndim() >= 4 && X.ndim() <= 5); @@ -68,8 +69,7 @@ class MIOPENPoolOp : public ConvPoolOpBase<HIPContext> { C = X.dim32(1); H = X.dim32(2); W = X.ndim() > 3 ? X.dim32(3) : 1; - auto sizes = ConvPoolOpBase::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase::SetOutputSize(X, Y, C); N_out = Y->dim32(0); C_out = Y->dim32(1); diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h index 60e641eb63..df05cad403 100644 --- a/caffe2/operators/locally_connected_op_impl.h +++ b/caffe2/operators/locally_connected_op_impl.h @@ -20,6 +20,7 @@ template <typename T, class Context> bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHW() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); const int image_ndim = X.dim() - 2; CAFFE_ENFORCE_EQ(X.dim() + image_ndim, filter.dim()); lc_op_util::ShapeParams shape; @@ -40,8 +41,7 @@ bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto output_sizes = ConvPoolOpBase<Context>::GetOutputSize(X, shape.M); - auto* Y = Output(0, output_sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, shape.M); shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector<int> output_image_dims = GetDims(*Y); @@ -109,6 +109,7 @@ template <typename T, class Context> bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWC() { const auto& X = Input(INPUT); const auto& filter = Input(FILTER); + auto* Y = Output(0); CAFFE_ENFORCE_EQ( kernel_.size(), 2, @@ -123,8 +124,7 @@ bool LocallyConnectedOp<T, Context>::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 1), kernel_h()); CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 2), kernel_w()); CAFFE_ENFORCE_EQ(filter.dim32(image_ndim + 3), shape.C); - auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, shape.M); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, shape.M); shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); diff --git a/caffe2/operators/lp_pool_op.cc b/caffe2/operators/lp_pool_op.cc index cfe9e12533..2b6aef179c 100644 --- a/caffe2/operators/lp_pool_op.cc +++ b/caffe2/operators/lp_pool_op.cc @@ -13,8 +13,8 @@ struct LpPoolFunctor { template <> bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); - auto sizes = ConvPoolOpBase::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype<float>()); + auto* Y = Output(0); + ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1)); const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); const auto inv_p = 1.0 / p; @@ -59,11 +59,11 @@ bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() { template <> bool PoolOp<float, CPUContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); int height = X.dim32(1); int width = X.dim32(2); int channels = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const auto p = OperatorBase::GetSingleArgument<float>("p", 2.0); const auto inv_p = 1.0 / p; diff --git a/caffe2/operators/lp_pool_op.cu b/caffe2/operators/lp_pool_op.cu index 8380be1b92..1547b002f3 100644 --- a/caffe2/operators/lp_pool_op.cu +++ b/caffe2/operators/lp_pool_op.cu @@ -215,9 +215,8 @@ __global__ void LpPoolBackwardNHWC( template <> bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype<float>()); - + auto* Y = Output(0); + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1)); int output_size = Y->size(); LpPoolForwardNCHW<float> <<<CAFFE_GET_BLOCKS(output_size), @@ -246,9 +245,8 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() { template <> bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(3)); - auto* Y = Output(0, sizes, at::dtype<float>()); - + auto* Y = Output(0); + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(3)); int output_size = Y->size(); LpPoolForwardNHWC<float> <<<CAFFE_GET_BLOCKS(output_size), diff --git a/caffe2/operators/max_pool_with_index.cu b/caffe2/operators/max_pool_with_index.cu index 3afa1b1bd1..c5bbb0b976 100644 --- a/caffe2/operators/max_pool_with_index.cu +++ b/caffe2/operators/max_pool_with_index.cu @@ -108,11 +108,10 @@ __global__ void MaxPoolBackward( template <typename T> bool MaxPoolWithIndexOp::DoRunWithType() { auto& X = Input(0); + auto* Y = Output(0); auto* mask = Output(1); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = Output(0, sizes, at::dtype<T>()); - + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, X.dim32(1)); int output_size = Y->size(); mask->Resize(output_size); diff --git a/caffe2/operators/pad_op.cc b/caffe2/operators/pad_op.cc index 3813fc29c6..2dca106e59 100644 --- a/caffe2/operators/pad_op.cc +++ b/caffe2/operators/pad_op.cc @@ -22,11 +22,11 @@ using std::max; template <> bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); + auto* Y = Output(0); int channels = X.dim32(1); int height = X.dim32(2); int width = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const float* Xdata = X.data<float>(); float* Ydata = Y->template mutable_data<float>(); @@ -160,11 +160,11 @@ bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() { template <> bool PadImageOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); int height = X.dim32(1); int width = X.dim32(2); int channels = X.dim32(3); - auto sizes = ConvPoolOpBase::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase::SetOutputSize(X, Y, channels); const float* Xdata = X.data<float>(); float* Ydata = Y->template mutable_data<float>(); diff --git a/caffe2/operators/pad_op_gpu.cu b/caffe2/operators/pad_op_gpu.cu index a9fcb87907..b36a6b25a3 100644 --- a/caffe2/operators/pad_op_gpu.cu +++ b/caffe2/operators/pad_op_gpu.cu @@ -251,13 +251,12 @@ __global__ void PadImageGradientEdgeNHWC( template <> bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() { auto& X = Input(0); + auto* Y = Output(0); const int num = X.dim32(0); const int channels = X.dim32(1); const int height = X.dim32(2); const int width = X.dim32(3); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype<float>()); - + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, channels); const int output_size = Y->size(); const int padded_height = Y->dim32(2); const int padded_width = Y->dim32(3); @@ -328,13 +327,12 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() { template<> bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() { auto& X = Input(0); + auto* Y = Output(0); const int num = X.dim32(0); const int height = X.dim32(1); const int width = X.dim32(2); const int channels = X.dim32(3); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, channels); - auto* Y = Output(0, sizes, at::dtype<float>()); - + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, channels); const int output_size = Y->size(); const int padded_height = Y->dim32(1); const int padded_width = Y->dim32(2); @@ -405,7 +403,7 @@ bool PadImageOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() { template<> bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() { auto& dY = Input(0); - + auto* dX = Output(0, { dY.dim32(0), dY.dim32(1), dY.dim32(2) - pad_t() - pad_b(), @@ -485,7 +483,7 @@ bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() { template<> bool PadImageGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() { auto& dY = Input(0); - + auto* dX = Output(0, { dY.dim32(0), dY.dim32(1) - pad_t() - pad_b(), dY.dim32(2) - pad_l() - pad_r(), diff --git a/caffe2/operators/pool_op.h b/caffe2/operators/pool_op.h index 004ec6a1ed..909fe120ca 100644 --- a/caffe2/operators/pool_op.h +++ b/caffe2/operators/pool_op.h @@ -36,10 +36,10 @@ class PoolOp final : public ConvPoolOpBase<Context> { bool RunOnDeviceWithOrderNCHW() override { const auto& X = Input(0); + auto* Y = Output(0); const int N = X.dim32(0); const int C = X.dim32(1); - auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, C); const T* X_data = X.template data<T>(); T* Y_data = Y->template mutable_data<T>(); if (global_pooling_) { @@ -65,11 +65,11 @@ class PoolOp final : public ConvPoolOpBase<Context> { bool RunOnDeviceWithOrderNHWC() override { const auto& X = Input(0); + auto* Y = Output(0); const int ndim = X.ndim(); const int N = X.dim32(0); const int C = X.dim32(ndim - 1); - auto sizes = ConvPoolOpBase<Context>::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<Context>::SetOutputSize(X, Y, C); const T* X_data = X.template data<T>(); T* Y_data = Y->template mutable_data<T>(); if (global_pooling_) { diff --git a/caffe2/operators/pool_op_cudnn.cc b/caffe2/operators/pool_op_cudnn.cc index 95bf2d5567..1ed723ca4e 100644 --- a/caffe2/operators/pool_op_cudnn.cc +++ b/caffe2/operators/pool_op_cudnn.cc @@ -99,11 +99,11 @@ class CuDNNPoolOp final : public ConvPoolOpBase<CUDAContext> { template <typename T> bool DoRunWithType() { const auto& X = Input(0); + auto* Y = Output(0); const int ndim = X.ndim(); const int N = X.dim32(0); const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1); - auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, C); - auto* Y = Output(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, C); const T* X_data = X.template data<T>(); T* Y_data = Y->template mutable_data<T>(); diff --git a/caffe2/operators/quantized/int8_average_pool_op.h b/caffe2/operators/quantized/int8_average_pool_op.h index e81829e49e..1d542ce230 100644 --- a/caffe2/operators/quantized/int8_average_pool_op.h +++ b/caffe2/operators/quantized/int8_average_pool_op.h @@ -44,8 +44,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase<CPUContext> { CHECK_EQ(X.t.dim(), 4); const int channels = X.t.dim32(3); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, channels); - ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU)); + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels); initQNNPACK(); diff --git a/caffe2/operators/quantized/int8_conv_op.h b/caffe2/operators/quantized/int8_conv_op.h index 5f007f5654..51afa7a38e 100644 --- a/caffe2/operators/quantized/int8_conv_op.h +++ b/caffe2/operators/quantized/int8_conv_op.h @@ -43,8 +43,7 @@ class Int8ConvOp final : public ConvPoolOpBase<CPUContext> { this->template GetSingleArgument<int>("Y_zero_point", 0); double Y_scale = this->template GetSingleArgument<float>("Y_scale", 1); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, W.t.dim32(0)); - ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU)); + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), W.t.dim32(0)); Y->scale = Y_scale; Y->zero_point = Y_offset; diff --git a/caffe2/operators/quantized/int8_max_pool_op.h b/caffe2/operators/quantized/int8_max_pool_op.h index c1ab54da49..ce65bb49cc 100644 --- a/caffe2/operators/quantized/int8_max_pool_op.h +++ b/caffe2/operators/quantized/int8_max_pool_op.h @@ -42,8 +42,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase<CPUContext> { CHECK_EQ(X.t.dim(), 4); const int channels = X.t.dim32(3); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X.t, channels); - ReinitializeTensor(&(Y->t), sizes, at::dtype<uint8_t>().device(CPU)); + ConvPoolOpBase<CPUContext>::SetOutputSize(X.t, &(Y->t), channels); initQNNPACK(); diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc index b339e52240..b3713377be 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op.cc @@ -102,8 +102,8 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() { const Tensor& X = InputTensorCPU_(INPUT); int N = X.dim32(0); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>()); + Tensor* Y = OutputTensorCPU_(0); + this->SetOutputSize(X, Y, filter.dim32(0)); const int output_image_size = this->GetDimsSize(*Y); if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) { @@ -228,6 +228,7 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -245,8 +246,7 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>()); + this->SetOutputSize(X, Y, filter.dim32(0)); const vector<int> input_dims = GetDims(X); const vector<int> output_dims = GetDims(*Y); @@ -618,14 +618,14 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNHWC() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(X.ndim() - 1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_); - auto sizes = this->GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<uint8_t>()); + this->SetOutputSize(X, Y, filter.dim32(0)); // The dimension of each kernel const int kernel_dim = this->KernelDim_(); // The output image size is the spatial size of the output. diff --git a/caffe2/quantization/server/conv_dnnlowp_op.cc b/caffe2/quantization/server/conv_dnnlowp_op.cc index 366f235ba8..f4de399215 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op.cc +++ b/caffe2/quantization/server/conv_dnnlowp_op.cc @@ -559,6 +559,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); const int M = filter.dim32(0); @@ -576,8 +577,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() { 0, "The number of output channels is not divisible by group."); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); const vector<int> input_dims = GetDims(X); const vector<int> output_dims = GetDims(*Y); @@ -1417,6 +1417,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() { const Tensor& X = InputTensorCPU_(INPUT); auto& filter = InputTensorCPU_(FILTER); + Tensor* Y = OutputTensorCPU_(0); const int C = X.dim32(X.dim() - 1); const int G = group_; CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); @@ -1433,8 +1434,7 @@ bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ( M % G, 0, "The number of output channels is not divisible by group."); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); // The col buffer is stored in HWC order as well - kernel_dim, and the height // and width. diff --git a/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h b/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h index c82a2879f2..ef2df60814 100644 --- a/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h +++ b/caffe2/quantization/server/conv_pool_dnnlowp_op_base.h @@ -61,12 +61,6 @@ class ConvPoolDNNLowPOpBase : public ConvPoolOpBase<CPUContext> { return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t; } - Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) { - auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t; - ReinitializeTensor(t, dims, options.device(CPU)); - return t; - } - T* GetQuantizedOutputData_() { return OutputTensorCPU_(0)->template mutable_data<T>(); } diff --git a/caffe2/quantization/server/dnnlowp_op.h b/caffe2/quantization/server/dnnlowp_op.h index 715567abd3..d9414569a7 100644 --- a/caffe2/quantization/server/dnnlowp_op.h +++ b/caffe2/quantization/server/dnnlowp_op.h @@ -115,16 +115,6 @@ class DNNLowPOp : public Operator<CPUContext> { } } - Tensor* OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) { - if (dequantize_output_) { - return Output(idx, dims, options.device(CPU)); - } else { - auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t; - ReinitializeTensor(t, dims, options.device(CPU)); - return t; - } - } - T* GetQuantizedOutputData_() { if (dequantize_output_) { out_temp_.resize(Output(0)->numel()); diff --git a/caffe2/quantization/server/pool_dnnlowp_op.cc b/caffe2/quantization/server/pool_dnnlowp_op.cc index 6b887f2350..bbf6026113 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op.cc +++ b/caffe2/quantization/server/pool_dnnlowp_op.cc @@ -100,8 +100,8 @@ class AveragePoolDnnLowPOp final GetOutputQuantizationParams_(); auto& X = InputTensorCPU_(0); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + auto* Y = OutputTensorCPU_(0); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1)); T* Ydata = GetQuantizedOutputData_(); @@ -238,9 +238,9 @@ class AveragePoolDnnLowPOp final GetOutputQuantizationParams_(); auto& X = InputTensorCPU_(0); + auto* Y = OutputTensorCPU_(0); int channels = X.dim32(X.ndim() - 1); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, channels); T* Ydata = GetQuantizedOutputData_(); @@ -397,8 +397,8 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> { const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); auto& X = InputTensorCPU_(0); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, X.dim32(1)); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + auto* Y = OutputTensorCPU_(0); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1)); T* Ydata = GetQuantizedOutputData_(); @@ -543,9 +543,9 @@ class MaxPoolDnnLowPOp final : public ConvPoolDNNLowPOpBase<T, MaxPoolFp32Op> { const T* Xdata = QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp); auto& X = InputTensorCPU_(0); + auto* Y = OutputTensorCPU_(0); int channels = X.dim32(X.ndim() - 1); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, channels); - auto* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, channels); T* Ydata = GetQuantizedOutputData_(); diff --git a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc index 33b1a983a2..7b1c662c6b 100644 --- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc +++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc @@ -442,6 +442,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> { bool RunOnDeviceWithOrderNCHW() override { const Tensor& X = Input(0); auto& filter = Input(1); + Tensor* Y = Output(0); const int N = X.dim32(0), C = X.dim32(1); CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim()); const int M = filter.dim32(0); @@ -451,8 +452,8 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> { CAFFE_ENFORCE_EQ(C, this->group_); CAFFE_ENFORCE_EQ(M, this->group_); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); + Y->mutable_data<float>(); DepthwiseArgs args; args.batch = X.dim32(0); diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index dab207b02f..67dd180453 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -147,8 +147,10 @@ NNPACKConvOp::getActivationType() const { bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { /* Global variable with a unique ID of the pre-transformed kernel blob */ volatile static uint32_t precomputed_transform_id = 0; + auto& X = Input(0); auto& filter = Input(1); + auto* Y = Output(0); CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4"); const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3); CAFFE_ENFORCE(filter.ndim() == 4, ""); @@ -158,8 +160,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, ""); CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), ""); CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), ""); - auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); - Tensor* Y = Output(0, sizes, at::dtype<float>()); + ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0)); const int oH = Y->dim32(2), oW = Y->dim32(3); const float* biasData = NULL; |