author     Brennan Vincent <btv@fb.com>  2019-01-28 08:47:35 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2019-01-28 08:50:27 -0800
commit     14138f4605bf4ddd24808c9eb4890a11c0b789dd (patch)
tree       7a9174c7889a3bd0fab9c680ce2dd8318affcd25 /aten
parent     d2cdffaf37a732ce14964e23c5fb139c636c2934 (diff)
Don't initialize a new `std::vector` in a loop. (#15850)
Summary:
Before this diff, we execute `std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});` on every iteration of `foreach_reduced_elt`. Change the code to execute that line only when we need it, i.e., when we are actually about to parallelize. This overhead is quite significant when we are doing many small reductions in single-threaded code:

```
x = torch.randn((1024, 10, 1024), dtype=torch.float64)
torch.set_num_threads(1)
%timeit x.std(1)
```

Before (with #15845 applied): 708.25 ms
After: 508 ms

Pull Request resolved: https://github.com/pytorch/pytorch/pull/15850

Differential Revision: D13612960

Pulled By: umanwizard

fbshipit-source-id: f5e61abfe0027775c97ed81ac09c997fbee741df
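The win comes from moving the buffer allocation off the serial fast path. A rough sketch of the pattern, using `std::thread` as a stand-in for `at::parallel_for` (simplified, hypothetical names such as `kGrainSize` and `reduce_sum`; not the actual ATen implementation):

```
#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

constexpr int64_t kGrainSize = 32768;  // stand-in for at::internal::GRAIN_SIZE

template <typename acc_t>
acc_t sum_range(const acc_t* data, int64_t begin, int64_t end, acc_t acc) {
  for (int64_t i = begin; i < end; ++i) acc += data[i];
  return acc;
}

template <typename acc_t>
acc_t reduce_sum(const acc_t* data, int64_t numel, acc_t init, int max_threads) {
  // Serial fast path: no vector is constructed here at all. Before the diff,
  // the per-thread buffer below was built on every call, including this path.
  if (numel < kGrainSize || max_threads <= 1) {
    return sum_range(data, 0, numel, init);
  }
  // Parallel path: one accumulator slot per thread, allocated only when needed.
  std::vector<acc_t> buffer(static_cast<unsigned>(max_threads), init);
  std::vector<std::thread> workers;
  const int64_t chunk = (numel + max_threads - 1) / max_threads;
  for (int t = 0; t < max_threads; ++t) {
    const int64_t begin = t * chunk;
    const int64_t end = std::min<int64_t>(numel, begin + chunk);
    if (begin >= end) break;
    workers.emplace_back([&buffer, data, t, begin, end] {
      buffer[t] = sum_range(data, begin, end, buffer[t]);  // each thread owns one slot
    });
  }
  for (auto& w : workers) w.join();
  acc_t total = init;
  for (const acc_t& partial : buffer) total += partial;  // combine partial sums
  return total;
}
```

In the `x.std(1)` benchmark above, reducing a (1024, 10, 1024) tensor over dim 1 means roughly a million single-output reductions of ten elements each, so skipping one heap allocation (plus `optional` bookkeeping) per output element plausibly accounts for the ~200 ms saved.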
Diffstat (limited to 'aten')
-rw-r--r--  aten/src/ATen/native/cpu/Reduce.h | 44
1 file changed, 25 insertions(+), 19 deletions(-)
diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h
index 3cab114d63..c0d553e2dd 100644
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@@ -83,36 +83,42 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) {
     "the accumulate type must be default-constructible"
   );
   iter.foreach_reduced_elt([&](TensorIterator &sub_iter) {
-    auto numel = sub_iter.numel();
-    bool serial = numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region();
-    int max_threads = serial ? 1 : at::get_max_threads();
-    AT_ASSERT(max_threads > 0);
-    std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});
-    at::parallel_for(0, numel, serial ? (1 + numel) : internal::GRAIN_SIZE,
-    [&](int64_t begin, int64_t end) {
-      auto &acc = buffer[at::get_thread_num()];
+    auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t {
       sub_iter.serial_for_each([&acc, &ops, &init](int ntensors, char** data, const int64_t* strides, int64_t size) {
         AT_ASSERT(ntensors == 2);
         char *in = data[1];
         int64_t stride = strides[1];
-        if (!acc && size > 0) {
-          //acc = acc_t {};
-          acc = init;
-        }
         for (int64_t i = 0; i < size; ++i) {
-          acc = ops.reduce(*acc, *(data_t*)in);
+          acc = ops.reduce(acc, *(data_t*)in);
           in += stride;
         }
       }, {begin, end});
-    });
-    acc_t acc = init;
-    for (int i = 0; i < max_threads; ++i) {
-      if (buffer[i]) {
-        acc = ops.combine(acc, *buffer[i]);
+      return acc;
+    };
+    acc_t total_acc = init;
+    auto numel = sub_iter.numel();
+    if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) {
+      total_acc = reduction_body(total_acc, 0, numel);
+    } else {
+      int max_threads = at::get_max_threads();
+      AT_ASSERT(max_threads > 0);
+      static_assert(
+        !std::is_same<acc_t, bool>::value,
+        "Concurrently modifying different references into std::vector<bool> is UB."
+      );
+      std::vector<acc_t> buffer((unsigned)max_threads, init);
+      at::parallel_for(0, numel, internal::GRAIN_SIZE,
+        [&](int64_t begin, int64_t end) {
+          auto& acc = buffer[at::get_thread_num()];
+          acc = reduction_body(acc, begin, end);
+        }
+      );
+      for (int i = 0; i < max_threads; ++i) {
+        total_acc = ops.combine(total_acc, buffer[i]);
       }
     }
     char *out = (char *)sub_iter.data_ptr(0);
-    *(data_t*)out = ops.project(acc);
+    *(data_t*)out = ops.project(total_acc);
   });
 }
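The new `static_assert` exists because the parallel path now stores plain `acc_t` values in a `std::vector` and lets each thread write its own slot. That is safe for every element type except `bool`: `std::vector<bool>` is bit-packed, so "references" to distinct elements can share a byte, and concurrent writes through them are a data race. A small sketch of the distinction (illustrative only, not ATen code):

```
#include <thread>
#include <vector>

int main() {
  // Fine: std::vector<int> stores one addressable object per element, so two
  // threads writing distinct indices touch distinct memory locations.
  std::vector<int> ok(2, 0);
  std::thread a([&ok] { ok[0] = 1; });
  std::thread b([&ok] { ok[1] = 1; });
  a.join();
  b.join();

  // Not fine: std::vector<bool> packs elements into bits, so bad[0] and bad[1]
  // may live in the same byte; the same two writes would race (undefined behavior).
  // std::vector<bool> bad(2, false);
  // std::thread c([&bad] { bad[0] = true; });  // would race with d on a shared byte
  // std::thread d([&bad] { bad[1] = true; });
  return 0;
}
```

The assert makes that constraint explicit at the exact point where per-thread slots are handed out.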