diff options
author | Brennan Vincent <btv@fb.com> | 2019-01-28 08:47:35 -0800 |
---|---|---|
committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2019-01-28 08:50:27 -0800 |
commit | 14138f4605bf4ddd24808c9eb4890a11c0b789dd (patch) | |
tree | 7a9174c7889a3bd0fab9c680ce2dd8318affcd25 /aten | |
parent | d2cdffaf37a732ce14964e23c5fb139c636c2934 (diff) | |
download | pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.tar.gz pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.tar.bz2 pytorch-14138f4605bf4ddd24808c9eb4890a11c0b789dd.zip |
Don't initialize a new `std::vector` in a loop. (#15850)
Summary:
Before this diff, we executed `std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {});` in every iteration of `foreach_reduced_elt`. Change the code to only execute that line when we need it, i.e., when we are actually about to parallelize.
This overhead is quite significant when we are doing a lot of small reductions in single-threaded code.
```
x=torch.randn((1024,10,1024),dtype=torch.float64)
torch.set_num_threads(1)
%timeit x.std(1)
```
Before (with #15845 applied): 708.25 ms
After: 508 ms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15850
Differential Revision: D13612960
Pulled By: umanwizard
fbshipit-source-id: f5e61abfe0027775c97ed81ac09c997fbee741df
Diffstat (limited to 'aten')
-rw-r--r-- | aten/src/ATen/native/cpu/Reduce.h | 44 |
1 files changed, 25 insertions, 19 deletions
diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 3cab114d63..c0d553e2dd 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -83,36 +83,42 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { "the accumulate type must be default-constructible" ); iter.foreach_reduced_elt([&](TensorIterator &sub_iter) { - auto numel = sub_iter.numel(); - bool serial = numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region(); - int max_threads = serial ? 1 : at::get_max_threads(); - AT_ASSERT(max_threads > 0); - std::vector<optional<acc_t>> buffer((unsigned)max_threads, optional<acc_t> {}); - at::parallel_for(0, numel, serial ? (1 + numel) : internal::GRAIN_SIZE, - [&](int64_t begin, int64_t end) { - auto &acc = buffer[at::get_thread_num()]; + auto reduction_body = [&](acc_t acc, int64_t begin, int64_t end) -> acc_t { sub_iter.serial_for_each([&acc, &ops, &init](int ntensors, char** data, const int64_t* strides, int64_t size) { AT_ASSERT(ntensors == 2); char *in = data[1]; int64_t stride = strides[1]; - if (!acc && size > 0) { - //acc = acc_t {}; - acc = init; - } for (int64_t i = 0; i < size; ++i) { - acc = ops.reduce(*acc, *(data_t*)in); + acc = ops.reduce(acc, *(data_t*)in); in += stride; } }, {begin, end}); - }); - acc_t acc = init; - for (int i = 0; i < max_threads; ++i) { - if (buffer[i]) { - acc = ops.combine(acc, *buffer[i]); + return acc; + }; + acc_t total_acc = init; + auto numel = sub_iter.numel(); + if (numel < at::internal::GRAIN_SIZE || at::get_max_threads() == 1 || at::in_parallel_region()) { + total_acc = reduction_body(total_acc, 0, numel); + } else { + int max_threads = at::get_max_threads(); + AT_ASSERT(max_threads > 0); + static_assert( + !std::is_same<acc_t, bool>::value, + "Concurrently modifying different references into std::vector<bool> is UB." 
+ ); + std::vector<acc_t> buffer((unsigned)max_threads, init); + at::parallel_for(0, numel, internal::GRAIN_SIZE, + [&](int64_t begin, int64_t end) { + auto& acc = buffer[at::get_thread_num()]; + acc = reduction_body(acc, begin, end); + } + ); + for (int i = 0; i < max_threads; ++i) { + total_acc = ops.combine(total_acc, buffer[i]); } } char *out = (char *)sub_iter.data_ptr(0); - *(data_t*)out = ops.project(acc); + *(data_t*)out = ops.project(total_acc); }); } |