diff options
author | Thomas Viehmann <tv.github@beamnet.de> | 2018-04-16 20:41:47 +0200 |
---|---|---|
committer | Edward Z. Yang <ezyang@mit.edu> | 2018-04-16 14:41:47 -0400 |
commit | 40592f91b5f045c02443e9d390491bba7f5dcf46 (patch) | |
tree | 6a9a9918bc0ea0b81c2591f9cfa38d056f810f32 | |
parent | 24b49314625e13f98cd761dec939db1d825420d9 (diff) | |
download | pytorch-40592f91b5f045c02443e9d390491bba7f5dcf46.tar.gz pytorch-40592f91b5f045c02443e9d390491bba7f5dcf46.tar.bz2 pytorch-40592f91b5f045c02443e9d390491bba7f5dcf46.zip |
Fix bilinear performance regression (#6110)
The current implementation of bilinear uses a matrix multiplication approach. This creates a large intermediate matrix (batch * output dimension * input dimension). Relative to the previous pure python approach, this caused a severe performance regression (600ms vs. 18ms for 300x100x200 weights and a batch of 50 on CPU, and also quadratic memory).
The attached change restores the performance using the previous strategy of looping over output features. It implements forward, backward, and double backward as native ATen code.
Credits:
Martin Tutek reported the regression and pinpointed the problem
Adam Paszke patiently answered my questions about ATen
I would not have been able to prepare this without you, thank you!
I referenced the old python implementation, used a python version of the naive implementation, and coded manual functions etc.
The tests have gradgradcheck etc.
* fix memory use of native bilinear
* bilinear double backward
* Move bilinear_double_backward to Functions.cpp
Addresses review comment by Tongzhou Wang. Thank you!
* add WrapDimUtilsMulti.h
* start at generic trilinear
* move to generic trilinear
* catch up on dim_list_to_bitset
* switch bilinear to use _trilinear implement _trilinear_backward
* add comments to Linear.cpp, move _trilinear in yaml
-rw-r--r-- | aten/src/ATen/WrapDimUtilsMulti.h | 26 | ||||
-rw-r--r-- | aten/src/ATen/native/Linear.cpp | 171 | ||||
-rw-r--r-- | aten/src/ATen/native/native_functions.yaml | 3 | ||||
-rw-r--r-- | tools/autograd/derivatives.yaml | 2 | ||||
-rw-r--r-- | tools/autograd/templates/Functions.cpp | 13 |
5 files changed, 209 insertions, 6 deletions
diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h new file mode 100644 index 0000000000..50130af52b --- /dev/null +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ATen/TensorImpl.h" +#include "ATen/WrapDimUtils.h" +#include <sstream> +#include <bitset> + +namespace at { + +// This is in an extra file to work around strange interaction of +// bitset on Windows with operator overloading + +constexpr size_t dim_bitset_size = 64; + +static inline std::bitset<dim_bitset_size> dim_list_to_bitset(IntList dims, int64_t ndims, bool wrap_scalar=true) { + AT_ASSERT(ndims <= (int64_t) dim_bitset_size, "only tensors with up to %zu dims are supported", dim_bitset_size); + std::bitset<dim_bitset_size> seen; + for (size_t i = 0; i < dims.size(); i++) { + size_t dim = maybe_wrap_dim(dims[i], ndims); + AT_ASSERT(!seen[dim], "dim %zu appears multiple times in the list of reduced dims", dim); + seen[dim] = true; + } + return seen; +} + +} diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index f000e15653..ac89eebd51 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,9 +1,166 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" - +#include "ATen/WrapDimUtilsMulti.h" namespace at { namespace native { +// sumproduct_pair computes `(left*right).sum(sumdims)` by means of permutation and +// batch matrix multiplication // its main purpose is to provide a pairwise +// reduction for einsum +Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntList sum_dims_, bool keepdim) { + // assumes that tensors have been pre-unsqueezed + AT_ASSERT(left_.dim()==right_.dim(), "number of dimensions must match"); + if (sum_dims_.size() == 0) + return at::mul(left_, right_); + int64_t dim = left_.dim(); + auto sum_dims = dim_list_to_bitset(sum_dims_, dim); + std::vector<int64_t> lro, lo, ro; + int64_t lro_size = 1, lo_size = 1, ro_size = 1, sum_size = 1; 
+ Tensor left = left_; + Tensor right = right_; + for (int64_t i = 0; i < dim; i++) { + auto sl = left.size(i)>1; + auto sr = right.size(i)>1; + if (sum_dims[i]) { + if (sl && sr) { + AT_ASSERT(left.size(i)==right.size(i), "sum indexes must match"); + sum_size *= left.size(i); + } else if (sl) { + left = left.sum(i, true); + } else if (sr) { + right = right.sum(i, true); + } + } else if (sl && sr) { + AT_ASSERT(left.size(i)==right.size(i), "non-broadcast dimensions must match"); + lro.push_back(i); + lro_size *= left.size(i); + } else if (sl) { + lo.push_back(i); + lo_size *= left.size(i); + } else { + ro.push_back(i); + ro_size *= right.size(i); + } + } + std::vector<int64_t> out_size; + for (auto& d : lro) out_size.push_back(left.size(d)); + for (auto& d : lo) out_size.push_back(left.size(d)); + for (auto& d : sum_dims_) { out_size.push_back(1); (void)(d); }; // avoid warining about not using d + for (auto& d : ro) out_size.push_back(right.size(d)); + + std::vector<int64_t> lpermutation(lro); + lpermutation.insert(lpermutation.end(), lo.begin(), lo.end()); + lpermutation.insert(lpermutation.end(), sum_dims_.begin(), sum_dims_.end()); + lpermutation.insert(lpermutation.end(), ro.begin(), ro.end()); + + std::vector<int64_t> rpermutation(lro); + rpermutation.insert(rpermutation.end(), sum_dims_.begin(), sum_dims_.end()); + rpermutation.insert(rpermutation.end(), ro.begin(), ro.end()); + rpermutation.insert(rpermutation.end(), lo.begin(), lo.end()); + + std::vector<int64_t> opermutation(lro.size()+lo.size()+sum_dims_.size()+ro.size(), -1); + { + int64_t i = 0; + + for (auto it = lro.begin(); it != lro.end(); i++, it++) { + opermutation[*it] = i; + } + for (auto it = lo.begin(); it != lo.end(); i++, it++) { + opermutation[*it] = i; + } + for (auto it = sum_dims_.begin(); it != sum_dims_.end(); i++, it++) { + opermutation[*it] = i; + } + for (auto it = ro.begin(); it != ro.end(); i++, it++) { + opermutation[*it] = i; + } + } + + left = 
left.permute(lpermutation).reshape({lro_size, lo_size, sum_size}); + right = right.permute(rpermutation).reshape({lro_size, sum_size, ro_size}); + Tensor result = at::bmm(left, right); + result = result.view(out_size).permute(opermutation); + if (! keepdim) { + for (int i = dim-1; i>=0; i--) + if (sum_dims[i]) + result.squeeze_(i); + } + return result; +} + +// _trilinear computes a trilinear einstein sum with an unrolled dimension +// the result is `(i1.unsqueeze(expand1)*i2.unsqueeze(expand2)*i2.unsqueeze(expand3)).sum(sumdim)` +// the computation is unrolled in the unroll_dim dimension +// its main purpose is to unify the computations in bilinear and bilinear_backward +Tensor _trilinear(const Tensor& i1_, const Tensor& i2_, const Tensor& i3_, + IntList expand1_, IntList expand2_, IntList expand3_, + IntList sumdim_, int64_t unroll_dim) { + int64_t total_dim = i1_.dim()+expand1_.size(); + AT_ASSERT((unroll_dim >= 0) && (unroll_dim < total_dim), "unroll_dim must be in [0,%zd]", total_dim-1); + auto expand1 = dim_list_to_bitset(expand1_, total_dim); + auto expand2 = dim_list_to_bitset(expand2_, total_dim); + auto expand3 = dim_list_to_bitset(expand3_, total_dim); + auto sumdim = dim_list_to_bitset(sumdim_, total_dim); + Tensor i1 = i1_; + Tensor i2 = i2_; + Tensor i3 = i3_; + std::vector<int64_t> output_size; + std::vector<int64_t> sum_dims_12, sum_dims_23; + int64_t unroll_size = -1; + // asserts... + for (int64_t i = 0; i < total_dim; i++) { + int64_t s = 0; + if (expand1[i]) { + i1 = i1.unsqueeze(i); + } else { + s = i1.size(i); + } + if (expand2[i]) { + i2 = i2.unsqueeze(i); + } else { + s = i2.size(i); + } + if (expand3[i]) { + i3 = i3.unsqueeze(i); + if (sumdim[i] && (i != unroll_dim)) + sum_dims_12.push_back(i); + } else { + s = i3.size(i); + if (sumdim[i] && (i != unroll_dim)) + sum_dims_23.push_back(i); + } + output_size.push_back(sumdim[i] ? 1 : s); + if (i == unroll_dim) + unroll_size = s; + } + int64_t slicemul1 = (expand1[unroll_dim] ? 
0 : 1); + int64_t slicemul2 = (expand2[unroll_dim] ? 0 : 1); + int64_t slicemul3 = (expand3[unroll_dim] ? 0 : 1); + + auto output = i1.type().tensor(output_size).zero_(); + if (! sumdim[unroll_dim]) { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k * slicemul1, 1), + i2.narrow(unroll_dim, k * slicemul2, 1), + sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k * slicemul3, 1), sum_dims_23, true); + output.narrow(unroll_dim, k, 1).add_(buf); + } + } + else { + for (int64_t k = 0; k < unroll_size; k++) { + Tensor buf = at::native::sumproduct_pair(i1.narrow(unroll_dim, k*slicemul1, 1), + i2.narrow(unroll_dim, k*slicemul2, 1), sum_dims_12, true); + buf = at::native::sumproduct_pair(buf, i3.narrow(unroll_dim, k*slicemul3, 1), sum_dims_23, true); + output.add_(buf); + } + } + for (int64_t i = output.dim()-1; i >= 0; i--) + if (sumdim[i]) + output.squeeze_(i); + return output; +} + Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight, const Tensor& bias) { AT_ASSERT(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got %lld and %lld", (long long)input1.dim(), (long long)input2.dim()); @@ -22,11 +179,13 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight "bilinear(): bias size does not match weight size: got %lld but expected %lld", (long long)bias.size(0), (long long)weight.size(0)); - auto b_input1 = input1.unsqueeze(-2).unsqueeze(-2); - auto b_input2 = input2.unsqueeze(-2).unsqueeze(-1); - - auto output = at::matmul(at::matmul(b_input1, weight), b_input2); - output = output.squeeze(-1).squeeze(-1); + std::vector<int64_t> output_size; + auto size1 = input1.sizes(); + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + output_size.push_back(weight.size(0)); + auto input1_flattened = input1.view({-1, input1.size(-1)}); + auto input2_flattened = input2.view({-1, 
input2.size(-1)}); + Tensor output = at::_trilinear(input1_flattened, weight, input2_flattened, {1,3}, {0}, {1,2}, {2,3}).reshape(output_size); if (bias.defined()) { output = output + bias; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6533c8248d..47214e0074 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -695,6 +695,9 @@ - func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor variants: method +- func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor + variants: function + - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, bool size_average=true, bool reduce=true) -> Tensor variants: function diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 1e256c3dda..60e785ad38 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -688,6 +688,8 @@ self: not_implemented("_standard_gamma_grad") # NN +- name: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim) + i1, i2, i3: _trilinear_backward(grad, i1, i2, i3, expand1, expand2, expand3, sumdim, unroll_dim, grad_input_mask) - name: binary_cross_entropy_forward(Tensor self, Tensor target, Tensor weight, bool size_average, bool reduce) self: binary_cross_entropy_backward(grad, self, target, weight, size_average, reduce) diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 15fd0ede61..1bf7986f01 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -1268,6 +1268,19 @@ std::tuple<Tensor, Tensor, Tensor> batchnorm_double_backward( } +std::tuple<Tensor, Tensor, Tensor> 
_trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3, + IntList expand1, IntList expand2, IntList expand3, + IntList sumdim, int64_t unroll_dim, std::array<bool, 3> grad_mask) { + Tensor grad_i1, grad_i2, grad_i3; + if (grad_mask[0]) + grad_i1 = at::_trilinear(grad_out, i2, i3, sumdim, expand2, expand3, expand1); + if (grad_mask[1]) + grad_i2 = at::_trilinear(i1, grad_out, i3, expand1, sumdim, expand3, expand2); + if (grad_mask[2]) + grad_i3 = at::_trilinear(i1, i2, grad_out, expand1, expand2, sumdim, expand3); + return std::tuple<Tensor, Tensor, Tensor>(grad_i1, grad_i2, grad_i3); +} + } // anonymous namespace ${autograd_function_definitions} |