diff options
author | Brennan Vincent <btv@fb.com> | 2019-01-28 08:47:35 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-01-28 08:50:27 -0800 |
commit | 14138f4605bf4ddd24808c9eb4890a11c0b789dd (patch) | |
tree | 7a9174c7889a3bd0fab9c680ce2dd8318affcd25 /aten | |
parent | d2cdffaf37a732ce14964e23c5fb139c636c2934 (diff) | |
download | pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.tar.gz pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.tar.bz2 pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.zip |
Don't initialize a new `std::vector` in a loop. (#15850)
Summary:
Before this diff, we executed `std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});` in every iteration of `foreach_reduced_elt`. Change the code to only execute that line when we need it, i.e., when we are actually about to parallelize.
This overhead is quite significant when we are doing a lot of small reductions in single-threaded code.
```
x=torch.randn((1024,10,1024),dtype=torch.float64)
torch.set_num_threads(1)
%timeit x.std(1)
```
Before (with #15845 applied): 708.25 ms
After: 508 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15850
Differential Revision: D13612960
Pulled By: umanwizard
fbshipit-source-id: f5e61abfe0027775c97ed81ac09c997fbee741df
Diffstat (limited to 'aten')
-rw-r--r-- | aten/src/ATen/native/cpu/Reduce.h | 44 |
1 files changed, 25 insertions, 19 deletions
diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 3cab114d63..c0d553e2dd 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -83,36 +83,42 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { "the accumulate type must be default-constructible" ); iter.foreach_reduced_elt([&](TensorIterator &sub_iter) { - auto numel = sub_iter.numel(); - bool serial = numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region(); - int max_threads = serial ? 1 : at::get_max_threads(); - AT_ASSERT(max_threads > 0); - std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {}); - at::parallel_for(0, numel, serial ? (1 + numel) : internal::GRAIN_SIZE, - [&](int64_t begin, int64_t end) { - auto &acc = buffer[at::get_thread_num()]; + auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t { sub_iter.serial_for_each([&acc, &ops, &init](int ntensors, char** data, const int64_t* strides, int64_t size) { AT_ASSERT(ntensors == 2); char *in = data[1]; int64_t stride = strides[1]; - if (!acc && size > 0) { - //acc = acc_t {}; - acc = init; - } for (int64_t i = 0; i < size; ++i) { - acc = ops.reduce(*acc, *(data_t*)in); + acc = ops.reduce(acc, *(data_t*)in); in += stride; } }, {begin, end}); - }); - acc_t acc = init; - for (int i = 0; i < max_threads; ++i) { - if (buffer[i]) { - acc = ops.combine(acc, *buffer[i]); + return acc; + }; + acc_t total_acc = init; + auto numel = sub_iter.numel(); + if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) { + total_acc = reduction_body(total_acc, 0, numel); + } else { + int max_threads = at::get_max_threads(); + AT_ASSERT(max_threads > 0); + static_assert( + !std::is_same<acc_t, bool>::value, + "Concurrently modifying different references into std::vector<bool> is UB." 
+ ); + std::vector<acc_t> buffer((unsigned)max_threads, init); + at::parallel_for(0, numel, internal::GRAIN_SIZE, + [&](int64_t begin, int64_t end) { + auto& acc = buffer[at::get_thread_num()]; + acc = reduction_body(acc, begin, end); + } + ); + for (int i = 0; i < max_threads; ++i) { + total_acc = ops.combine(total_acc, buffer[i]); } } char *out = (char *)sub_iter.data_ptr(0); - *(data_t*)out = ops.project(acc); + *(data_t*)out = ops.project(total_acc); }); } |