author     Adam Paszke <adam.paszke@gmail.com>  2018-12-04 00:13:24 -0800
committer  Facebook Github Bot <facebook-github-bot@users.noreply.github.com>  2018-12-04 00:16:21 -0800
commit     8812a5d42e7be858334a4f5bd6acc7e6e942815f (patch)
tree       841dbbe407584a0c6d18571ea19edee45e7adfda /test/cpp
parent     862b8cae51321a16936d87567e8a455186ca2d85 (diff)
Reduce broadcasted inputs in derivative code (#14485)
Summary:
Previously, symbolic AD formulas assumed that no broadcasting happened and would return gradients of incorrect shapes (possibly leading to silent errors later).

Fixes a few bugs (known and unknown):
- #11736
- ArgumentSpec didn't compute the input types correctly [(it didn't advance the offset for non-tensor args)](https://github.com/pytorch/pytorch/pull/14485/files#diff-4fd3157a056596aefb8cdf41022a208bR153)
- Symbolic AD could suffer from use-after-free (dangling pointers in the grad map), because [`EliminateDeadCode` could have removed nodes](https://github.com/pytorch/pytorch/pull/14485/files#diff-25d33ad1ed6855684dec79d927ca6142L781) that referenced gradients of certain values.
- Undefined behavior in `aten::size`

During my tests I also found a few new problems and have opened issues for them:
- FusionGroup seems to think that cat nodes broadcast their inputs (#14483)
- `prim::ConstantChunk` derivative formula doesn't handle undefined inputs (#14484)

This patch unfortunately deoptimizes some of our code (fusion doesn't happen past chunk nodes, and we output more tensors only because we have to get their sizes). I know how to fix those issues, but wanted to fix this terrible bug quickly.

cc zou3519 zdevito ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/14485

Reviewed By: eellison

Differential Revision: D13312888

Pulled By: suo

fbshipit-source-id: ad46bfb4d0a306ad9451002f8270f7a790f72d58
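The core of the fix is that when an input was broadcast in the forward pass, the incoming gradient has the broadcast shape and must be summed back down to the input's original shape before it is returned. Below is a minimal standalone sketch of that reduction written against the libtorch C++ API; it is not the actual JIT autodiff code, and the helper name `reduce_grad_to` is made up purely for illustration.

```cpp
// Sketch: reduce the gradient of a broadcast input back to the input's shape.
// Not the real PyTorch JIT code; `reduce_grad_to` is a hypothetical helper.
#include <torch/torch.h>
#include <iostream>

torch::Tensor reduce_grad_to(torch::Tensor grad, at::IntArrayRef input_sizes) {
  // Broadcasting may have prepended extra leading dimensions; sum them away.
  while (grad.dim() > static_cast<int64_t>(input_sizes.size())) {
    grad = grad.sum(/*dim=*/0, /*keepdim=*/false);
  }
  // Where the input had size 1 but the gradient does not, sum and keep the dim.
  for (int64_t i = 0; i < grad.dim(); ++i) {
    if (input_sizes[i] == 1 && grad.size(i) != 1) {
      grad = grad.sum(/*dim=*/i, /*keepdim=*/true);
    }
  }
  return grad;
}

int main() {
  auto a = torch::randn({4, 3});
  auto b = torch::randn({3});           // broadcast against `a` in a + b
  auto grad_out = torch::ones({4, 3});  // gradient flowing back from (a + b)
  auto grad_b = reduce_grad_to(grad_out, b.sizes());
  std::cout << grad_b.sizes() << "\n";  // [3], matching b's shape
  return 0;
}
```

Doing this reduction requires knowing the original sizes of the broadcast inputs at backward time, which appears to be why the patch captures more outputs (see the changed test expectations below) and why the commit message notes the temporary deoptimization.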
Diffstat (limited to 'test/cpp')
-rw-r--r--  test/cpp/jit/tests.h  45
1 file changed, 22 insertions(+), 23 deletions(-)
diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h
index 2a4edc1794..d728a65e9b 100644
--- a/test/cpp/jit/tests.h
+++ b/test/cpp/jit/tests.h
@@ -439,38 +439,38 @@ std::shared_ptr<Graph> build_lstm() {
return r;
}
-void run(InterpreterState & interp, const std::vector<at::Tensor> & inputs, std::vector<at::Tensor> & outputs) {
+std::vector<at::Tensor> run(InterpreterState & interp, const std::vector<at::Tensor> & inputs) {
std::vector<IValue> stack(inputs.begin(), inputs.end());
interp.run(stack);
- outputs.clear();
- for (auto& ivalue : stack) {
- outputs.push_back(std::move(ivalue).toTensor());
- }
+ return fmap(stack, [](const IValue& i) { return i.toTensor(); });
}
std::pair<tensor_list, tensor_list> runGradient(
Gradient& grad_spec,
tensor_list& tensors_in,
tensor_list& tensor_grads_in) {
- tensor_list tensors_out, tensor_grads_out;
+ static const auto as_tensorlist = [](const Stack& stack) {
+ return fmap(stack, [](const IValue& i) { return i.toTensor(); });
+ };
Code f_code{grad_spec.f}, df_code{grad_spec.df};
InterpreterState f_interpreter{f_code}, df_interpreter{df_code};
- run(f_interpreter, tensors_in, tensors_out);
+ auto f_stack = fmap<IValue>(tensors_in);
+ f_interpreter.run(f_stack);
- tensor_list df_inputs;
- df_inputs.insert(
- df_inputs.end(), tensor_grads_in.begin(), tensor_grads_in.end());
+ Stack df_stack;
+ df_stack.insert(
+ df_stack.end(), tensor_grads_in.begin(), tensor_grads_in.end());
for (auto offset : grad_spec.df_input_captured_inputs)
- df_inputs.push_back(tensors_in[offset]);
+ df_stack.push_back(tensors_in[offset]);
for (auto offset : grad_spec.df_input_captured_outputs)
- df_inputs.push_back(tensors_out[offset]);
- run(df_interpreter, df_inputs, tensor_grads_out);
+ df_stack.push_back(f_stack[offset]);
+ df_interpreter.run(df_stack);
// Outputs of f needs to be sliced
- tensors_out.erase(
- tensors_out.begin() + grad_spec.f_real_outputs, tensors_out.end());
- return std::make_pair(tensors_out, tensor_grads_out);
+ f_stack.erase(
+ f_stack.begin() + grad_spec.f_real_outputs, f_stack.end());
+ return std::make_pair(as_tensorlist(f_stack), as_tensorlist(df_stack));
}
void assertAllClose(const tensor_list& a, const tensor_list& b) {
@@ -496,9 +496,8 @@ void testInterp() {
auto lstm_g = build_lstm();
Code lstm_function(lstm_g);
- std::vector<at::Tensor> outputs;
InterpreterState lstm_interp(lstm_function);
- run(lstm_interp, {input[0], hx, cx, w_ih, w_hh}, outputs);
+ auto outputs = run(lstm_interp, {input[0], hx, cx, w_ih, w_hh});
std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
// std::cout << almostEqual(outputs[0],hx) << "\n";
@@ -836,8 +835,8 @@ void testDifferentiate(std::ostream& out = std::cout) {
auto grad_spec = differentiate(graph);
std::vector<size_t> expected_captured_inputs = {0, 1};
- std::vector<size_t> expected_captured_outputs = {1};
- std::vector<size_t> expected_input_vjps = {0, 1};
+ std::vector<size_t> expected_captured_outputs = {1, 2, 3, 4, 5, 6, 7};
+ std::vector<size_t> expected_input_vjps = {0, 3};
std::vector<size_t> expected_output_vjps = {0, 1};
ASSERT_EQ(grad_spec.f_real_outputs, 1);
ASSERT_EQ(grad_spec.df_input_captured_inputs, expected_captured_inputs);
@@ -867,11 +866,11 @@ void testDifferentiateWithRequiresGrad(std::ostream& out = std::cout) {
PropagateRequiresGrad(graph);
auto grad_spec = differentiate(graph);
- std::vector<size_t> expected_input_vjps = {1, 2}; // for e and %4 = (d + a)
+ std::vector<size_t> expected_input_vjps = {1, 4}; // for e and %4 = (d + a)
std::vector<size_t> expected_output_vjps = {0}; // only a requires grad
- ASSERT_EQ(grad_spec.f_real_outputs, 2); // we need one temporary %4 = (d + a)
+ ASSERT_EQ(grad_spec.f_real_outputs, 2);
ASSERT_EQ(grad_spec.df_input_captured_inputs, std::vector<size_t>({0}));
- ASSERT_EQ(grad_spec.df_input_captured_outputs, std::vector<size_t>({2}));
+ ASSERT_EQ(grad_spec.df_input_captured_outputs, std::vector<size_t>({2, 3, 4, 5, 6, 7, 8}));
ASSERT_EQ(grad_spec.df_input_vjps, expected_input_vjps);
ASSERT_EQ(grad_spec.df_output_vjps, expected_output_vjps);
out << "testDifferentiateWithRequiresGrad\n";