| Field | Value | Date |
|---|---|---|
| author | Adam Paszke <adam.paszke@gmail.com> | 2018-12-04 00:13:24 -0800 |
| committer | Facebook Github Bot <facebook-github-bot@users.noreply.github.com> | 2018-12-04 00:16:21 -0800 |
| commit | 8812a5d42e7be858334a4f5bd6acc7e6e942815f (patch) | |
| tree | 841dbbe407584a0c6d18571ea19edee45e7adfda /test/cpp | |
| parent | 862b8cae51321a16936d87567e8a455186ca2d85 (diff) | |
Reduce broadcasted inputs in derivative code (#14485)
Summary:
Previously, symbolic AD formulas assumed that no broadcasting happened and would return gradients of incorrect shapes (possibly leading to silent errors later).
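For context on the shape problem (not part of the patch itself): when an op broadcasts one of its inputs, the gradient flowing back has the broadcasted shape and must be reduced back to the input's shape, which eager-mode autograd already does and which the symbolic formulas were missing. A minimal libtorch sketch of that reduction, with purely illustrative names and setup:

```cpp
#include <torch/torch.h>
#include <iostream>

// Illustrative only: shows why a gradient w.r.t. a broadcasted input must be
// summed over the broadcast dimensions to get back to the input's shape.
int main() {
  auto a = torch::randn({2, 3}, torch::requires_grad());
  auto b = torch::randn({3}, torch::requires_grad()); // broadcast along dim 0

  auto c = a + b;                    // c: {2, 3}
  auto grad_c = torch::ones_like(c); // incoming gradient: {2, 3}
  c.backward(grad_c);

  // Eager autograd reduces the gradient: b.grad() has shape {3}, equal to
  // grad_c.sum(0). Returning the unreduced {2, 3} gradient is the kind of
  // shape bug this patch fixes for the JIT's symbolic AD.
  std::cout << b.grad().sizes() << "\n";                         // [3]
  std::cout << torch::allclose(b.grad(), grad_c.sum(0)) << "\n"; // 1
}
```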
Fixes a few bugs (known and unknown):
- #11736
- ArgumentSpec didn't compute the input types correctly [(it didn't advance the offset for non-tensor args)](https://github.com/pytorch/pytorch/pull/14485/files#diff-4fd3157a056596aefb8cdf41022a208bR153) (see the sketch after this list)
- Symbolic AD could suffer from use after free (dangling pointers in grad map), because [`EliminateDeadCode` could have removed nodes](https://github.com/pytorch/pytorch/pull/14485/files#diff-25d33ad1ed6855684dec79d927ca6142L781) that referenced gradients of certain values.
- Undefined behavior in `aten::size`
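To make the ArgumentSpec bullet concrete, here is a hypothetical sketch of that class of bug (this is not the actual ArgumentSpec code): if the per-argument write offset advances only for tensor arguments, the metadata of any tensor that follows a non-tensor argument lands in the wrong slot.

```cpp
#include <cassert>
#include <vector>

// Hypothetical sketch, not PyTorch's ArgumentSpec: per-argument metadata must
// advance its write offset for every argument, including non-tensor ones.
struct ArgInfo {
  bool is_tensor = false;
  int ndim = 0;
};

// kinds: 0 means a non-tensor argument, n > 0 means a tensor with n dimensions.
std::vector<ArgInfo> recordArgs(const std::vector<int>& kinds) {
  std::vector<ArgInfo> infos(kinds.size());
  size_t offset = 0;
  for (int kind : kinds) {
    if (kind > 0) {
      infos[offset].is_tensor = true;
      infos[offset].ndim = kind;
    }
    ++offset; // the buggy variant advanced this only inside the if-branch
  }
  return infos;
}

int main() {
  // tensor (2-d), scalar, tensor (3-d)
  auto infos = recordArgs({2, 0, 3});
  // Holds only because the offset also advances for the scalar argument.
  assert(infos[2].is_tensor && infos[2].ndim == 3);
  return 0;
}
```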
During my tests I also found a few new problems and opened issues for them:
- FusionGroup seems to think that cat nodes broadcast their inputs (#14483)
- `prim::ConstantChunk` derivative formula doesn't handle undefined inputs (#14484)
This patch unfortunately deoptimizes some of our code (fusion doesn't happen past chunk nodes, and we output more tensors only because we have to get their sizes). I know how to fix those issues, but I wanted to fix this terrible bug quickly.
cc zou3519 zdevito ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14485
Reviewed By: eellison
Differential Revision: D13312888
Pulled By: suo
fbshipit-source-id: ad46bfb4d0a306ad9451002f8270f7a790f72d58
Diffstat (limited to 'test/cpp')
-rw-r--r-- | test/cpp/jit/tests.h | 45 |
1 file changed, 22 insertions, 23 deletions
diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h
index 2a4edc1794..d728a65e9b 100644
--- a/test/cpp/jit/tests.h
+++ b/test/cpp/jit/tests.h
@@ -439,38 +439,38 @@ std::shared_ptr<Graph> build_lstm() {
   return r;
 }
 
-void run(InterpreterState & interp, const std::vector<at::Tensor> & inputs, std::vector<at::Tensor> & outputs) {
+std::vector<at::Tensor> run(InterpreterState & interp, const std::vector<at::Tensor> & inputs) {
   std::vector<IValue> stack(inputs.begin(), inputs.end());
   interp.run(stack);
-  outputs.clear();
-  for (auto& ivalue : stack) {
-    outputs.push_back(std::move(ivalue).toTensor());
-  }
+  return fmap(stack, [](const IValue& i) { return i.toTensor(); });
 }
 
 std::pair<tensor_list, tensor_list> runGradient(
     Gradient& grad_spec,
     tensor_list& tensors_in,
     tensor_list& tensor_grads_in) {
-  tensor_list tensors_out, tensor_grads_out;
+  static const auto as_tensorlist = [](const Stack& stack) {
+    return fmap(stack, [](const IValue& i) { return i.toTensor(); });
+  };
   Code f_code{grad_spec.f}, df_code{grad_spec.df};
   InterpreterState f_interpreter{f_code}, df_interpreter{df_code};
 
-  run(f_interpreter, tensors_in, tensors_out);
+  auto f_stack = fmap<IValue>(tensors_in);
+  f_interpreter.run(f_stack);
 
-  tensor_list df_inputs;
-  df_inputs.insert(
-      df_inputs.end(), tensor_grads_in.begin(), tensor_grads_in.end());
+  Stack df_stack;
+  df_stack.insert(
+      df_stack.end(), tensor_grads_in.begin(), tensor_grads_in.end());
   for (auto offset : grad_spec.df_input_captured_inputs)
-    df_inputs.push_back(tensors_in[offset]);
+    df_stack.push_back(tensors_in[offset]);
   for (auto offset : grad_spec.df_input_captured_outputs)
-    df_inputs.push_back(tensors_out[offset]);
-  run(df_interpreter, df_inputs, tensor_grads_out);
+    df_stack.push_back(f_stack[offset]);
+  df_interpreter.run(df_stack);
 
   // Outputs of f needs to be sliced
-  tensors_out.erase(
-      tensors_out.begin() + grad_spec.f_real_outputs, tensors_out.end());
-  return std::make_pair(tensors_out, tensor_grads_out);
+  f_stack.erase(
+      f_stack.begin() + grad_spec.f_real_outputs, f_stack.end());
+  return std::make_pair(as_tensorlist(f_stack), as_tensorlist(df_stack));
 }
 
 void assertAllClose(const tensor_list& a, const tensor_list& b) {
@@ -496,9 +496,8 @@ void testInterp() {
   auto lstm_g = build_lstm();
   Code lstm_function(lstm_g);
 
-  std::vector<at::Tensor> outputs;
   InterpreterState lstm_interp(lstm_function);
-  run(lstm_interp, {input[0], hx, cx, w_ih, w_hh}, outputs);
+  auto outputs = run(lstm_interp, {input[0], hx, cx, w_ih, w_hh});
   std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
 
   // std::cout << almostEqual(outputs[0],hx) << "\n";
@@ -836,8 +835,8 @@ void testDifferentiate(std::ostream& out = std::cout) {
   auto grad_spec = differentiate(graph);
   std::vector<size_t> expected_captured_inputs = {0, 1};
-  std::vector<size_t> expected_captured_outputs = {1};
-  std::vector<size_t> expected_input_vjps = {0, 1};
+  std::vector<size_t> expected_captured_outputs = {1, 2, 3, 4, 5, 6, 7};
+  std::vector<size_t> expected_input_vjps = {0, 3};
   std::vector<size_t> expected_output_vjps = {0, 1};
   ASSERT_EQ(grad_spec.f_real_outputs, 1);
   ASSERT_EQ(grad_spec.df_input_captured_inputs, expected_captured_inputs);
@@ -867,11 +866,11 @@ void testDifferentiateWithRequiresGrad(std::ostream& out = std::cout) {
   PropagateRequiresGrad(graph);
   auto grad_spec = differentiate(graph);
-  std::vector<size_t> expected_input_vjps = {1, 2}; // for e and %4 = (d + a)
+  std::vector<size_t> expected_input_vjps = {1, 4}; // for e and %4 = (d + a)
   std::vector<size_t> expected_output_vjps = {0}; // only a requires grad
-  ASSERT_EQ(grad_spec.f_real_outputs, 2); // we need one temporary %4 = (d + a)
+  ASSERT_EQ(grad_spec.f_real_outputs, 2);
   ASSERT_EQ(grad_spec.df_input_captured_inputs, std::vector<size_t>({0}));
-  ASSERT_EQ(grad_spec.df_input_captured_outputs, std::vector<size_t>({2}));
+  ASSERT_EQ(grad_spec.df_input_captured_outputs, std::vector<size_t>({2, 3, 4, 5, 6, 7, 8}));
   ASSERT_EQ(grad_spec.df_input_vjps, expected_input_vjps);
   ASSERT_EQ(grad_spec.df_output_vjps, expected_output_vjps);
   out << "testDifferentiateWithRequiresGrad\n";
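Both refactored test helpers lean on PyTorch's `fmap` utility to turn the interpreter `Stack` of `IValue`s back into tensors. Roughly, `fmap` maps a callable over a container and collects the results into a `std::vector`; a minimal standalone sketch (not torch's actual implementation) looks like this:

```cpp
#include <iostream>
#include <vector>

// Minimal sketch of an fmap-style helper (not torch's real fmap): apply `fn`
// to every element of `inputs` and collect the results in a new vector.
template <typename T, typename F>
auto fmap(const std::vector<T>& inputs, F fn)
    -> std::vector<decltype(fn(inputs.front()))> {
  std::vector<decltype(fn(inputs.front()))> out;
  out.reserve(inputs.size());
  for (const auto& x : inputs) {
    out.push_back(fn(x));
  }
  return out;
}

int main() {
  std::vector<int> stack = {1, 2, 3};
  // Analogous to: fmap(stack, [](const IValue& i) { return i.toTensor(); })
  auto doubled = fmap(stack, [](int i) { return i * 2; });
  for (int v : doubled) std::cout << v << " "; // 2 4 6
  std::cout << "\n";
}
```

The call `fmap(stack, [](const IValue& i) { return i.toTensor(); })` in the test is the same pattern, with `IValue` elements and `at::Tensor` results.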