Diffstat (limited to 'torch/nn/modules/rnn.py')
-rw-r--r-- | torch/nn/modules/rnn.py | 214
1 file changed, 128 insertions, 86 deletions
diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index e1a7bc4af2..26aed4acdb 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -133,7 +133,8 @@ class RNNBase(Module):
 
 
 class RNN(RNNBase):
-    r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an input sequence.
+    r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an
+    input sequence.
 
     For each element in the input sequence, each layer computes the following
@@ -143,40 +144,49 @@ class RNN(RNNBase):
 
         h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})
 
-    where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden
-    state of the previous layer at time `t` or :math:`input_t` for the first layer.
-    If nonlinearity='relu', then `ReLU` is used instead of `tanh`.
+    where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is
+    the hidden state of the previous layer at time `t` or :math:`input_t`
+    for the first layer. If nonlinearity='relu', then `ReLU` is used instead
+    of `tanh`.
 
     Args:
         input_size: The number of expected features in the input x
         hidden_size: The number of features in the hidden state h
         num_layers: Number of recurrent layers.
         nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
-        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
-        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
-        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
+        bias: If False, then the layer does not use bias weights b_ih and b_hh.
+            Default: True
+        batch_first: If True, then the input and output tensors are provided
+            as (batch, seq, feature)
+        dropout: If non-zero, introduces a dropout layer on the outputs of each
+            RNN layer except the last layer
         bidirectional: If True, becomes a bidirectional RNN. Default: False
 
     Inputs: input, h_0
-        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
-          The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
+        - **input** (seq_len, batch, input_size): tensor containing the features
+          of the input sequence. The input can also be a packed variable length
+          sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
           for details.
-        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial hidden state
-          for each element in the batch.
+        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the initial hidden state for each element in the batch.
 
     Outputs: output, h_n
-        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features (h_k)
-          from the last layer of the RNN, for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given
-          as the input, the output will also be a packed sequence.
-        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for k=seq_len.
+        - **output** (seq_len, batch, hidden_size * num_directions): tensor
+          containing the output features (h_k) from the last layer of the RNN,
+          for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
+          been given as the input, the output will also be a packed sequence.
+        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the hidden state for k=seq_len.
 
     Attributes:
         weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
-            of shape `(input_size x hidden_size)`
+            of shape `(input_size x hidden_size)`
         weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
-            of shape `(hidden_size x hidden_size)`
-        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, of shape `(hidden_size)`
-        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, of shape `(hidden_size)`
+            of shape `(hidden_size x hidden_size)`
+        bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
+            of shape `(hidden_size)`
+        bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
+            of shape `(hidden_size)`
 
     Examples::
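For reference, a minimal shape-check sketch of the nn.RNN interface described in this docstring (not part of the commit; the sizes are arbitrary, and the Variable wrapper matches the PyTorch version this diff targets):

    >>> import torch
    >>> import torch.nn as nn
    >>> from torch.autograd import Variable
    >>> rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2)
    >>> input = Variable(torch.randn(5, 3, 10))   # (seq_len, batch, input_size)
    >>> h0 = Variable(torch.randn(2, 3, 20))      # (num_layers * num_directions, batch, hidden_size)
    >>> output, hn = rnn(input, h0)
    >>> output.size()                             # (seq_len, batch, hidden_size * num_directions)
    torch.Size([5, 3, 20])
    >>> hn.size()
    torch.Size([2, 3, 20])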
@@ -203,7 +213,8 @@ class RNN(RNNBase):
 
 
 class LSTM(RNNBase):
-    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
+    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
+    sequence.
 
     For each element in the input sequence, each layer computes the following
@@ -220,47 +231,54 @@ class LSTM(RNNBase):
             h_t = o_t * \tanh(c_t)
         \end{array}
 
-    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell state at time `t`,
-    :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer,
-    and :math:`i_t`, :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget,
-    cell, and out gates, respectively.
+    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
+    state at time `t`, :math:`x_t` is the hidden state of the previous layer at
+    time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
+    :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
+    and out gates, respectively.
 
     Args:
         input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
         num_layers: Number of recurrent layers.
-        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
-        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
-        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
+        bias: If False, then the layer does not use bias weights b_ih and b_hh.
+            Default: True
+        batch_first: If True, then the input and output tensors are provided
+            as (batch, seq, feature)
+        dropout: If non-zero, introduces a dropout layer on the outputs of each
+            RNN layer except the last layer
         bidirectional: If True, becomes a bidirectional RNN. Default: False
 
     Inputs: input, (h_0, c_0)
-        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
-          The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
-          for details.
-        - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor containing
-          the initial hidden state for each element in the batch.
-        - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor containing
-          the initial cell state for each element in the batch.
+        - **input** (seq_len, batch, input_size): tensor containing the features
+          of the input sequence.
+          The input can also be a packed variable length sequence.
+          See :func:`torch.nn.utils.rnn.pack_padded_sequence` for details.
+        - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor
+          containing the initial hidden state for each element in the batch.
+        - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor
+          containing the initial cell state for each element in the batch.
 
     Outputs: output, (h_n, c_n)
-        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing
-          the output features `(h_t)` from the last layer of the RNN, for each t. If a
-          :class:`torch.nn.utils.rnn.PackedSequence` has been given as the input, the output will also be a
-          packed sequence.
-        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len
-        - **c_n** (num_layers * num_directions, batch, hidden_size): tensor containing the cell state for t=seq_len
+        - **output** (seq_len, batch, hidden_size * num_directions): tensor
+          containing the output features `(h_t)` from the last layer of the RNN,
+          for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          given as the input, the output will also be a packed sequence.
+        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the hidden state for t=seq_len
+        - **c_n** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the cell state for t=seq_len
 
     Attributes:
-        weight_ih_l[k] : the learnable input-hidden weights of the k-th layer `(W_ii|W_if|W_ig|W_io)`, of shape
-            `(4*hidden_size x input_size)`
-        weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer `(W_hi|W_hf|W_hg|W_ho)`, of shape
-            `(4*hidden_size x hidden_size)`
-        bias_ih_l[k] : the learnable input-hidden bias of the k-th layer `(b_ii|b_if|b_ig|b_io)`, of shape
-            `(4*hidden_size)`
-        bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer `(b_hi|b_hf|b_hg|b_ho)`, of shape
-            `(4*hidden_size)`
+        weight_ih_l[k] : the learnable input-hidden weights of the k-th layer
+            `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size x input_size)`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer
+            `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size x hidden_size)`
+        bias_ih_l[k] : the learnable input-hidden bias of the k-th layer
+            `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer
+            `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
 
     Examples::
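A matching sketch for the (h_0, c_0) / (h_n, c_n) convention documented for nn.LSTM (illustrative only; assumes the same imports as the nn.RNN sketch above):

    >>> lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
    >>> input = Variable(torch.randn(5, 3, 10))   # (seq_len, batch, input_size)
    >>> h0 = Variable(torch.randn(2, 3, 20))      # (num_layers * num_directions, batch, hidden_size)
    >>> c0 = Variable(torch.randn(2, 3, 20))      # same shape as h0
    >>> output, (hn, cn) = lstm(input, (h0, c0))
    >>> output.size(), hn.size(), cn.size()
    (torch.Size([5, 3, 20]), torch.Size([2, 3, 20]), torch.Size([2, 3, 20]))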
@@ -292,40 +310,47 @@ class GRU(RNNBase):
         \end{array}
 
     where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
-    state of the previous layer at time `t` or :math:`input_t` for the first layer,
-    and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input, and new gates, respectively.
+    state of the previous layer at time `t` or :math:`input_t` for the first
+    layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input,
+    and new gates, respectively.
 
     Args:
         input_size: The number of expected features in the input x
         hidden_size: The number of features in the hidden state h
         num_layers: Number of recurrent layers.
-        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
-        batch_first: If True, then the input and output tensors are provided as (batch, seq, feature)
-        dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
+        bias: If False, then the layer does not use bias weights b_ih and b_hh.
+            Default: True
+        batch_first: If True, then the input and output tensors are provided
+            as (batch, seq, feature)
+        dropout: If non-zero, introduces a dropout layer on the outputs of each
+            RNN layer except the last layer
         bidirectional: If True, becomes a bidirectional RNN. Default: False
 
     Inputs: input, h_0
-        - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
-          The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
+        - **input** (seq_len, batch, input_size): tensor containing the features
+          of the input sequence. The input can also be a packed variable length
+          sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
           for details.
-        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor containing the initial
-          hidden state for each element in the batch.
+        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the initial hidden state for each element in the batch.
 
     Outputs: output, h_n
-        - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features h_t from
-          the last layer of the RNN, for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given as the
-          input, the output will also be a packed sequence.
-        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len
+        - **output** (seq_len, batch, hidden_size * num_directions): tensor
+          containing the output features h_t from the last layer of the RNN,
+          for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          given as the input, the output will also be a packed sequence.
+        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
+          containing the hidden state for t=seq_len
 
     Attributes:
-        weight_ih_l[k] : the learnable input-hidden weights of the k-th layer (W_ir|W_iz|W_in), of shape
-            `(3*hidden_size x input_size)`
-        weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer (W_hr|W_hz|W_hn), of shape
-            `(3*hidden_size x hidden_size)`
-        bias_ih_l[k] : the learnable input-hidden bias of the k-th layer (b_ir|b_iz|b_in), of shape
-            `(3*hidden_size)`
-        bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer (b_hr|b_hz|b_hn), of shape
-            `(3*hidden_size)`
+        weight_ih_l[k] : the learnable input-hidden weights of the k-th layer
+            (W_ir|W_iz|W_in), of shape `(3*hidden_size x input_size)`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer
+            (W_hr|W_hz|W_hn), of shape `(3*hidden_size x hidden_size)`
+        bias_ih_l[k] : the learnable input-hidden bias of the k-th layer
+            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer
+            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
 
     Examples::
 
         >>> rnn = nn.GRU(10, 20, 2)
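Since all three module docstrings point to :func:`torch.nn.utils.rnn.pack_padded_sequence`, here is a hedged sketch of the packed-input path using nn.GRU (lengths must be sorted in decreasing order; the padded batch and its lengths are arbitrary placeholders):

    >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
    >>> gru = nn.GRU(10, 20, 2)
    >>> padded = Variable(torch.randn(5, 3, 10))       # (seq_len, batch, input_size), zero-padded
    >>> packed = pack_padded_sequence(padded, [5, 3, 1])
    >>> packed_output, hn = gru(packed)                # output is also a PackedSequence
    >>> output, lengths = pad_packed_sequence(packed_output)  # back to (seq_len, batch, 20)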
@@ -362,19 +387,24 @@ class RNNCell(RNNCellBase):
     Args:
         input_size: The number of expected features in the input x
         hidden_size: The number of features in the hidden state h
-        bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True
+        bias: If False, then the layer does not use bias weights b_ih and b_hh.
+            Default: True
         nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
 
     Inputs: input, hidden
         - **input** (batch, input_size): tensor containing input features
-        - **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
+        - **hidden** (batch, hidden_size): tensor containing the initial hidden
+          state for each element in the batch.
 
     Outputs: h'
-        - **h'** (batch, hidden_size): tensor containing the next hidden state for each element in the batch
+        - **h'** (batch, hidden_size): tensor containing the next hidden state
+          for each element in the batch
 
     Attributes:
-        weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
-        weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
+        weight_ih: the learnable input-hidden weights, of shape
+            `(input_size x hidden_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(hidden_size x hidden_size)`
         bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
         bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
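A minimal sketch of driving an nn.RNNCell step by step, the cell-level counterpart of the sequence modules above (shapes illustrative; same imports as the earlier sketches):

    >>> cell = nn.RNNCell(10, 20)
    >>> input = Variable(torch.randn(6, 3, 10))   # (seq_len, batch, input_size)
    >>> hx = Variable(torch.zeros(3, 20))         # (batch, hidden_size)
    >>> outputs = []
    >>> for i in range(6):
    ...     hx = cell(input[i], hx)               # one time step per call
    ...     outputs.append(hx)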
@@ -443,20 +473,27 @@ class LSTMCell(RNNCellBase):
     Args:
         input_size: The number of expected features in the input x
         hidden_size: The number of features in the hidden state h
-        bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: True
+        bias: If `False`, then the layer does not use bias weights `b_ih` and
+            `b_hh`. Default: True
 
     Inputs: input, (h_0, c_0)
         - **input** (batch, input_size): tensor containing input features
-        - **h_0** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
-        - **c_0** (batch. hidden_size): tensor containing the initial cell state for each element in the batch.
+        - **h_0** (batch, hidden_size): tensor containing the initial hidden
+          state for each element in the batch.
+        - **c_0** (batch, hidden_size): tensor containing the initial cell state
+          for each element in the batch.
 
     Outputs: h_1, c_1
-        - **h_1** (batch, hidden_size): tensor containing the next hidden state for each element in the batch
-        - **c_1** (batch, hidden_size): tensor containing the next cell state for each element in the batch
+        - **h_1** (batch, hidden_size): tensor containing the next hidden state
+          for each element in the batch
+        - **c_1** (batch, hidden_size): tensor containing the next cell state
+          for each element in the batch
 
     Attributes:
-        weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
-        weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
+        weight_ih: the learnable input-hidden weights, of shape
+            `(input_size x hidden_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(hidden_size x hidden_size)`
         bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
        bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
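The same unrolled pattern for nn.LSTMCell, which carries the (hidden, cell) state pair documented above (a sketch, not the module's canonical example):

    >>> cell = nn.LSTMCell(10, 20)
    >>> input = Variable(torch.randn(6, 3, 10))   # (seq_len, batch, input_size)
    >>> hx = Variable(torch.zeros(3, 20))         # h_0: (batch, hidden_size)
    >>> cx = Variable(torch.zeros(3, 20))         # c_0: (batch, hidden_size)
    >>> for i in range(6):
    ...     hx, cx = cell(input[i], (hx, cx))     # returns (h_1, c_1)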
@@ -515,18 +552,23 @@ class GRUCell(RNNCellBase):
     Args:
         input_size: The number of expected features in the input x
         hidden_size: The number of features in the hidden state h
-        bias: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`
+        bias: If `False`, then the layer does not use bias weights `b_ih` and
+            `b_hh`. Default: `True`
 
     Inputs: input, hidden
         - **input** (batch, input_size): tensor containing input features
-        - **hidden** (batch, hidden_size): tensor containing the initial hidden state for each element in the batch.
+        - **hidden** (batch, hidden_size): tensor containing the initial hidden
+          state for each element in the batch.
 
     Outputs: h'
-        - **h'**: (batch, hidden_size): tensor containing the next hidden state for each element in the batch
+        - **h'**: (batch, hidden_size): tensor containing the next hidden state
+          for each element in the batch
 
     Attributes:
-        weight_ih: the learnable input-hidden weights, of shape `(input_size x hidden_size)`
-        weight_hh: the learnable hidden-hidden weights, of shape `(hidden_size x hidden_size)`
+        weight_ih: the learnable input-hidden weights, of shape
+            `(input_size x hidden_size)`
+        weight_hh: the learnable hidden-hidden weights, of shape
+            `(hidden_size x hidden_size)`
         bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
         bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
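Finally, a single-step sketch for nn.GRUCell, whose interface mirrors nn.RNNCell (illustrative shapes, same imports as above):

    >>> cell = nn.GRUCell(10, 20)
    >>> input = Variable(torch.randn(3, 10))      # (batch, input_size)
    >>> hx = Variable(torch.zeros(3, 20))         # (batch, hidden_size)
    >>> hx = cell(input, hx)                      # next hidden state h', same shape as hx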