286 files changed, 3346 insertions, 3028 deletions
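Most of the hunks below are whitespace-only PEP 8 / flake8 fixes: the setup.cfg hunk adds the ignored error codes and a new [flake8] section, and the .travis.yml hunk switches the Travis lint step from `pep8 setup.py` to plain `pep8`. One reformatted line worth a second look is in test/common.py, where get_numerical_jacobian ends with `outb.add_(-1, outa).div_(2 * perturbation)`; that is the central-difference estimate (f(x + h) - f(x - h)) / (2h). The following is a minimal standalone sketch of the same idea, with hypothetical names and plain torch tensors rather than the repository's helpers:

import torch


def numerical_grad(fn, x, eps=1e-6):
    # Central-difference gradient of a scalar-valued fn with respect to x:
    # perturb each element by +/- eps and divide the output difference by 2 * eps.
    # Hypothetical helper for illustration only; not the repository's function.
    flat_x = x.view(-1)          # shares storage with x, so edits perturb x itself
    grad = torch.zeros(flat_x.nelement()).double()
    for i in range(flat_x.nelement()):
        orig = float(flat_x[i])
        flat_x[i] = orig + eps
        f_plus = float(fn(x))
        flat_x[i] = orig - eps
        f_minus = float(fn(x))
        flat_x[i] = orig         # restore the original value
        grad[i] = (f_plus - f_minus) / (2 * eps)
    return grad.view_as(x)


# The numerical gradient of sum(x ** 2) should be close to 2 * x.
x = torch.rand(3, 3).double()
print(numerical_grad(lambda t: (t ** 2).sum(), x))

The real helper does this in place (add_, div_) on preallocated buffers; the sketch trades that for readability.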
diff --git a/.travis.yml b/.travis.yml index 781eb2db42..58a015cd1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,4 +44,4 @@ matrix: python: "2.7" addons: true install: pip install pep8 - script: pep8 setup.py + script: pep8 diff --git a/docs/source/conf.py b/docs/source/conf.py index f7b76d2098..2513f8c299 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -201,12 +201,13 @@ from docutils import nodes from sphinx.util.docfields import TypedField from sphinx import addnodes + def patched_make_field(self, types, domain, items): # type: (List, unicode, Tuple) -> nodes.field def handle_item(fieldarg, content): par = nodes.paragraph() par += addnodes.literal_strong('', fieldarg) # Patch: this line added - #par.extend(self.make_xrefs(self.rolename, domain, fieldarg, + # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, # addnodes.literal_strong)) if fieldarg in types: par += nodes.Text(' (') @@ -1,2 +1,7 @@ [pep8] max-line-length = 120 +ignore = E402,E721,E731 + +[flake8] +max-line-length = 120 +ignore = E305,E402,E721,E731,F401,F403,F405,F811,F812,F821,F841 diff --git a/test/common.py b/test/common.py index 46b9273e1a..48a7ae150f 100644 --- a/test/common.py +++ b/test/common.py @@ -12,6 +12,7 @@ from torch.autograd import Variable, Function torch.set_default_tensor_type('torch.DoubleTensor') + def run_tests(): parser = argparse.ArgumentParser(add_help=False) parser.add_argument('--seed', type=int, default=123) @@ -29,6 +30,7 @@ try: except ImportError: TEST_NUMPY = False + def get_cpu_type(t): assert t.__module__ == 'torch.cuda' return getattr(torch, t.__class__.__name__) @@ -155,7 +157,7 @@ def make_jacobian(input, num_out): return torch.zeros(input.nelement(), num_out) else: return type(input)(filter(lambda x: x is not None, - (make_jacobian(elem, num_out) for elem in input))) + (make_jacobian(elem, num_out) for elem in input))) def iter_tensors(x, only_requiring_grad=False): @@ -206,7 +208,7 @@ def get_numerical_jacobian(fn, input, target): outb.copy_(fn(input)) flat_tensor[i] = orig - outb.add_(-1,outa).div_(2*perturbation) + outb.add_(-1, outa).div_(2 * perturbation) d_tensor[i] = outb return jacobian diff --git a/test/common_nn.py b/test/common_nn.py index 5c43442953..174ab44e6b 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -25,14 +25,14 @@ module_tests = [ module_name='Linear', constructor_args=(10, 8), input_size=(4, 10), - reference_fn=lambda i,p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8) + reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8) ), dict( module_name='Linear', constructor_args=(10, 8, False), input_size=(4, 10), desc='no_bias', - reference_fn=lambda i,p: torch.mm(i, p[0].t()) + reference_fn=lambda i, p: torch.mm(i, p[0].t()) ), dict( module_name='Threshold', @@ -72,7 +72,7 @@ module_tests = [ dict( module_name='Hardtanh', input_size=(3, 2, 5), - reference_fn=lambda i,_: i.clamp(-1, 1) + reference_fn=lambda i, _: i.clamp(-1, 1) ), dict( module_name='Sigmoid', @@ -85,22 +85,22 @@ module_tests = [ dict( module_name='Softmax', input_size=(10, 20), - reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20)) + reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1).expand(10, 20)) ), dict( module_name='Softmax2d', input_size=(1, 3, 10, 20), - reference_fn=lambda i,_: torch.exp(i).div(torch.exp(i).sum(1).expand_as(i)) + reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1).expand_as(i)) ), dict( module_name='LogSoftmax', input_size=(10, 20), - reference_fn=lambda i,_: 
torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_() + reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1).expand(10, 20)).log_() ), dict( module_name='LogSoftmax', input_size=(1, 3, 10, 20), - reference_fn=lambda i,_: torch.exp(i).div_(torch.exp(i).sum(1).expand_as(i)).log_(), + reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1).expand_as(i)).log_(), desc='multiparam' ), dict( @@ -130,18 +130,18 @@ module_tests = [ dict( module_name='LogSigmoid', input_size=(2, 3, 4), - reference_fn=lambda i,_: i.sigmoid().log() + reference_fn=lambda i, _: i.sigmoid().log() ), dict( module_name='Softplus', input_size=(10, 20), - reference_fn=lambda i,_: torch.log(1 + torch.exp(i)) + reference_fn=lambda i, _: torch.log(1 + torch.exp(i)) ), dict( module_name='Softplus', constructor_args=(2,), input_size=(10, 20), - reference_fn=lambda i,_: 1. / 2. * torch.log(1 + torch.exp(2 * i)), + reference_fn=lambda i, _: 1. / 2. * torch.log(1 + torch.exp(2 * i)), desc='beta' ), dict( @@ -172,7 +172,7 @@ module_tests = [ dict( module_name='Softsign', input_size=(3, 2, 5), - reference_fn=lambda i,_: i.div(1 + torch.abs(i)) + reference_fn=lambda i, _: i.div(1 + torch.abs(i)) ), dict( module_name='Softmin', @@ -187,11 +187,11 @@ module_tests = [ criterion_tests = [ dict(module_name='L1Loss', - input_size=(2, 3, 4), - target=torch.randn(2, 3, 4), - reference_fn=lambda i,t,_: 1./i.numel() * \ - sum((a-b).abs().sum() for a,b in zip(i, t)) - ), + input_size=(2, 3, 4), + target=torch.randn(2, 3, 4), + reference_fn=lambda i, t, _: 1. / i.numel() * + sum((a - b).abs().sum() for a, b in zip(i, t)) + ), dict( module_name='NLLLoss', input=torch.rand(15, 10).log(), @@ -213,7 +213,7 @@ criterion_tests = [ module_name='MSELoss', input=torch.randn(2, 3, 4, 5), target=torch.randn(2, 3, 4, 5), - reference_fn=lambda i,t,_: (i-t).abs().pow(2).sum() / i.numel() + reference_fn=lambda i, t, _: (i - t).abs().pow(2).sum() / i.numel() ), dict( module_name='BCELoss', @@ -370,9 +370,9 @@ class NNTestCase(TestCase): if jacobian_input: for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)): - jacobian_x[:,i] = d_x + jacobian_x[:, i] = d_x if jacobian_parameters: - jacobian_param[:,i] = torch.cat(self._flatten_tensors(d_param), 0) + jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0) res = tuple() if jacobian_input: @@ -433,7 +433,7 @@ class NNTestCase(TestCase): fx1 = self._forward_criterion(criterion, input, target) x[i] = original - eps fx2 = self._forward_criterion(criterion, input, target) - deriv = (fx1 - fx2) / (2.*eps) + deriv = (fx1 - fx2) / (2. 
* eps) d_x[i] = deriv x[i] = original @@ -447,8 +447,9 @@ class NNTestCase(TestCase): class TestBase(object): + def __init__(self, constructor, constructor_args=tuple(), input_size=None, - input=None, desc='', reference_fn=None, fullname=None, **kwargs): + input=None, desc='', reference_fn=None, fullname=None, **kwargs): if input_size is None and input is None: raise RuntimeError("Specify either an input tensor, or it's size!") self.constructor = constructor @@ -496,6 +497,7 @@ class TestBase(object): class ModuleTest(TestBase): + def __init__(self, *args, **kwargs): super(ModuleTest, self).__init__(*args, **kwargs) self.jacobian_input = kwargs.get('jacobian_input', True) @@ -568,6 +570,7 @@ class ModuleTest(TestBase): class CriterionTest(TestBase): + def __init__(self, *args, **kwargs): super(CriterionTest, self).__init__(*args, **kwargs) self.target = self._get_target(kwargs['target']) @@ -590,7 +593,7 @@ class CriterionTest(TestBase): if isinstance(target, Variable): target = target.data expected_out = self.reference_fn(deepcopy(self._unpack_input(input)), - deepcopy(target), module) + deepcopy(target), module) test_case.assertEqual(out, expected_out) test_case.check_criterion_jacobian(module, input, self.target) diff --git a/test/data/network1.py b/test/data/network1.py index 9c052a1943..68fbe37696 100644 --- a/test/data/network1.py +++ b/test/data/network1.py @@ -2,6 +2,7 @@ import torch.nn as nn class Net(nn.Module): + def __init__(self): super(Net, self).__init__() self.linear = nn.Linear(10, 20) diff --git a/test/data/network2.py b/test/data/network2.py index 8db55a11a1..862593c5fc 100644 --- a/test/data/network2.py +++ b/test/data/network2.py @@ -2,6 +2,7 @@ import torch.nn as nn class Net(nn.Module): + def __init__(self): super(Net, self).__init__() self.linear = nn.Linear(10, 20) diff --git a/test/error_messages/storage.py b/test/error_messages/storage.py index 40dd94fffa..bde3df7dec 100644 --- a/test/error_messages/storage.py +++ b/test/error_messages/storage.py @@ -1,5 +1,6 @@ import torch + def check_error(desc, fn, *required_substrings): try: fn() @@ -16,54 +17,55 @@ def check_error(desc, fn, *required_substrings): assert False, "given function ({}) didn't raise an error".format(desc) check_error( - 'Wrong argument types', - lambda: torch.FloatStorage(object()), - 'object') + 'Wrong argument types', + lambda: torch.FloatStorage(object()), + 'object') check_error('Unknown keyword argument', - lambda: torch.FloatStorage(content=1234.), - 'keyword') + lambda: torch.FloatStorage(content=1234.), + 'keyword') check_error('Invalid types inside a sequence', - lambda: torch.FloatStorage(['a', 'b']), - 'list', 'str') + lambda: torch.FloatStorage(['a', 'b']), + 'list', 'str') check_error('Invalid size type', - lambda: torch.FloatStorage(1.5), - 'float') + lambda: torch.FloatStorage(1.5), + 'float') check_error('Invalid offset', - lambda: torch.FloatStorage(torch.FloatStorage(2), 4), - '2', '4') + lambda: torch.FloatStorage(torch.FloatStorage(2), 4), + '2', '4') check_error('Negative offset', - lambda: torch.FloatStorage(torch.FloatStorage(2), -1), - '2', '-1') + lambda: torch.FloatStorage(torch.FloatStorage(2), -1), + '2', '-1') check_error('Invalid size', - lambda: torch.FloatStorage(torch.FloatStorage(3), 1, 5), - '2', '1', '5') + lambda: torch.FloatStorage(torch.FloatStorage(3), 1, 5), + '2', '1', '5') check_error('Negative size', - lambda: torch.FloatStorage(torch.FloatStorage(3), 1, -5), - '2', '1', '-5') + lambda: torch.FloatStorage(torch.FloatStorage(3), 1, -5), + '2', '1', 
'-5') check_error('Invalid index type', - lambda: torch.FloatStorage(10)['first item'], - 'str') + lambda: torch.FloatStorage(10)['first item'], + 'str') + def assign(): torch.FloatStorage(10)[1:-1] = '1' check_error('Invalid value type', - assign, - 'str') + assign, + 'str') check_error('resize_ with invalid type', - lambda: torch.FloatStorage(10).resize_(1.5), - 'float') + lambda: torch.FloatStorage(10).resize_(1.5), + 'float') check_error('fill_ with invalid type', - lambda: torch.IntStorage(10).fill_('asdf'), - 'str') + lambda: torch.IntStorage(10).fill_('asdf'), + 'str') # TODO: frombuffer diff --git a/test/optim/test.py b/test/optim/test.py index 3c6e34f40c..076b9e5d2c 100644 --- a/test/optim/test.py +++ b/test/optim/test.py @@ -3,10 +3,12 @@ import torch import torch.legacy.optim as optim from pprint import pprint + def rosenbrock(tensor): x, y = tensor return (1 - x)**2 + 100 * (y - x**2)**2 + def drosenbrock(tensor): x, y = tensor return torch.DoubleTensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * x * (y - x**2))) diff --git a/test/test_autograd.py b/test/test_autograd.py index e7fc2c56d4..1d7ddc48f6 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -8,7 +8,7 @@ from copy import deepcopy from collections import OrderedDict from common import make_jacobian, TestCase, iter_tensors, \ - get_numerical_jacobian, run_tests + get_numerical_jacobian, run_tests from torch.autograd._functions import * from torch.autograd import Variable, Function @@ -46,7 +46,7 @@ def get_analytical_jacobian(input, output): zero_gradients(input) output.backward(grad_output, retain_variables=True) for jacobian_x, d_x in zip(jacobian, iter_gradients(input)): - jacobian_x[:,i] = d_x + jacobian_x[:, i] = d_x return jacobian @@ -68,6 +68,7 @@ class TestAutograd(TestCase): y = Variable(torch.ones(5, 5) * 4, requires_grad=True) counter = [0] + def bw_hook(inc, grad): self.assertIsInstance(grad, Variable) counter[0] += inc @@ -103,6 +104,7 @@ class TestAutograd(TestCase): # WARNING: this is a test for autograd internals. # You should never have to use such things in your code. class NoneGradientFunction(Function): + def forward(self, x, y): assert self.needs_input_grad[0] assert not self.needs_input_grad[1] @@ -114,6 +116,7 @@ class TestAutograd(TestCase): fn = NoneGradientFunction() fn._backward_hooks = OrderedDict() was_called = [False] + def hook(grad_input, grad_output): self.assertIsInstance(grad_input, tuple) self.assertIsInstance(grad_output, tuple) @@ -242,6 +245,7 @@ class TestAutograd(TestCase): self.assertFalse(a.requires_grad) b = a + z self.assertTrue(b.requires_grad) + def error(): raise RuntimeError # Make sure backward isn't called on these @@ -379,6 +383,7 @@ class TestAutograd(TestCase): segfault. 
""" class CollectOnDelete(Function): + def __del__(self): gc.collect() @@ -386,7 +391,7 @@ class TestAutograd(TestCase): Variable(torch.randn(10, 10), creator=CollectOnDelete()) @unittest.skipIf(not torch.cuda.is_available() or torch.cuda.device_count() < 2, - "CUDA not available or <2 GPUs detected") + "CUDA not available or <2 GPUs detected") def test_unused_output_gpu(self): from torch.nn.parallel._functions import Broadcast x = Variable(torch.randn(5, 5).float().cuda(), requires_grad=True) @@ -436,6 +441,7 @@ class TestAutograd(TestCase): def test_return_leaf(self): class Identity(Function): + def forward(self, a, b): return a, a + b @@ -443,6 +449,7 @@ class TestAutograd(TestCase): return grad_a + grad_b, grad_b class Inplace(InplaceFunction): + def forward(self, a, b): self.mark_dirty(a) return a.add_(b), b + 2 @@ -464,6 +471,7 @@ class TestAutograd(TestCase): def test_return_leaf_inplace(self): class Inplace(InplaceFunction): + def forward(self, a, b): self.mark_dirty(a) return a.add_(b), b + 2 @@ -496,51 +504,51 @@ class TestAutograd(TestCase): self.assertEqual(z.grad.data, torch.ones(5) * 2) def test_backward_copy(self): - # This tests checks backward engine for a very subtle bug that appreared - # in one of the initial versions of autograd. Gradients tensors were - # simply stored in lists while the function waited for all its gradients - # to be computed. However, sometimes an output was used multiple times, - # so the gradients needed to be summed. Engine used to keep a need_copy - # set of tensors that will need a clone upon next addition and removed - # them from the set as soon as the clone was performed. However, this - # could lead to incorrect results if the same gradient tensor was - # buffered in three places in the graph: - # 1. When accumulating gradients in one of these places it was cloned - # and removed from need_copy set. - # 2. When accumulating in second place, it wasn't in the need_copy set, - # so the gradients were simply accumulated in-place (which already - # modified the grad in 3rd place) - # 3. When accumulating in the third place, it wasn't in the need_copy set - # as well, so the incoming gradient was summed in-place, yielding - # incorrect results in all functions, except the first one. - x = Variable(torch.ones(5, 5), requires_grad=True) - y = Variable(torch.ones(5, 5), requires_grad=True) - # Simulate that we're in the middle of the graph - a = x + 2 - b = y + 2 - c = x + 2 - # This op will just return grad_output two times in backward - add1 = a + b - add2 = add1 + c - # Simulate a long branch, so grad_output will get buffered. - for i in range(4): - a = a * 2 - b = b * 2 - c = c * 2 - branch = a + b + c - out = add2 + branch - # expected gradients are: - # for x: 34 (16 from final a, 16 from final c, 2 from add2) - # for y: 17 (16 from final b, 1 from add2) - grad_output = torch.ones(5, 5) - out.backward(grad_output) - self.assertEqual(x.grad.data, torch.ones(5, 5) * 34) - self.assertEqual(y.grad.data, torch.ones(5, 5) * 17) + # This tests checks backward engine for a very subtle bug that appreared + # in one of the initial versions of autograd. Gradients tensors were + # simply stored in lists while the function waited for all its gradients + # to be computed. However, sometimes an output was used multiple times, + # so the gradients needed to be summed. Engine used to keep a need_copy + # set of tensors that will need a clone upon next addition and removed + # them from the set as soon as the clone was performed. 
However, this + # could lead to incorrect results if the same gradient tensor was + # buffered in three places in the graph: + # 1. When accumulating gradients in one of these places it was cloned + # and removed from need_copy set. + # 2. When accumulating in second place, it wasn't in the need_copy set, + # so the gradients were simply accumulated in-place (which already + # modified the grad in 3rd place) + # 3. When accumulating in the third place, it wasn't in the need_copy set + # as well, so the incoming gradient was summed in-place, yielding + # incorrect results in all functions, except the first one. + x = Variable(torch.ones(5, 5), requires_grad=True) + y = Variable(torch.ones(5, 5), requires_grad=True) + # Simulate that we're in the middle of the graph + a = x + 2 + b = y + 2 + c = x + 2 + # This op will just return grad_output two times in backward + add1 = a + b + add2 = add1 + c + # Simulate a long branch, so grad_output will get buffered. + for i in range(4): + a = a * 2 + b = b * 2 + c = c * 2 + branch = a + b + c + out = add2 + branch + # expected gradients are: + # for x: 34 (16 from final a, 16 from final c, 2 from add2) + # for y: 17 (16 from final b, 1 from add2) + grad_output = torch.ones(5, 5) + out.backward(grad_output) + self.assertEqual(x.grad.data, torch.ones(5, 5) * 34) + self.assertEqual(y.grad.data, torch.ones(5, 5) * 17) def test_functional_blas(self): def compare(fn, *args): unpacked_args = tuple(arg.data if isinstance(arg, Variable) else arg - for arg in args) + for arg in args) self.assertEqual(fn(*args).data, fn(*unpacked_args)) def test_blas_add(fn, x, y, z): @@ -553,27 +561,29 @@ class TestAutograd(TestCase): compare(fn, x, y) test_blas(torch.mm, Variable(torch.randn(2, 10)), - Variable(torch.randn(10, 4))) + Variable(torch.randn(10, 4))) test_blas_add(torch.addmm, Variable(torch.randn(2, 4)), - Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4))) + Variable(torch.randn(2, 10)), Variable(torch.randn(10, 4))) test_blas(torch.bmm, Variable(torch.randn(4, 2, 10)), - Variable(torch.randn(4, 10, 4))) + Variable(torch.randn(4, 10, 4))) test_blas_add(torch.addbmm, Variable(torch.randn(2, 4)), - Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4))) + Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4))) test_blas_add(torch.baddbmm, Variable(torch.randn(4, 2, 4)), - Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4))) + Variable(torch.randn(4, 2, 10)), Variable(torch.randn(4, 10, 4))) test_blas(torch.mv, Variable(torch.randn(2, 10)), - Variable(torch.randn(10))) + Variable(torch.randn(10))) test_blas_add(torch.addmv, Variable(torch.randn(2)), - Variable(torch.randn(2, 10)), Variable(torch.randn(10))) + Variable(torch.randn(2, 10)), Variable(torch.randn(10))) test_blas(torch.ger, Variable(torch.randn(5)), - Variable(torch.randn(6))) + Variable(torch.randn(6))) test_blas_add(torch.addr, Variable(torch.randn(5, 6)), - Variable(torch.randn(5)), Variable(torch.randn(6))) + Variable(torch.randn(5)), Variable(torch.randn(6))) def test_save_none_for_backward(self): test_case = self + class MyFn(Function): + def forward(self, input): self.save_for_backward(None, input, None) return input * input @@ -591,6 +601,7 @@ class TestAutograd(TestCase): def test_too_many_grads(self): class MyFn(Function): + def forward(self, input): return input @@ -679,6 +690,7 @@ class TestAutograd(TestCase): def test_dep_nograd(self): class F1(Function): + def forward(self, input): out = torch.randn(input.size()) self.mark_non_differentiable(out) @@ 
-688,6 +700,7 @@ class TestAutograd(TestCase): return grad_output class F2(Function): + def forward(self, input, ignored): return input @@ -710,6 +723,7 @@ def index_variable(shape, max_indices): index = torch.rand(*shape).mul_(max_indices).floor_().long() return Variable(index, requires_grad=False) + def gather_variable(shape, index_dim, max_indices): assert len(shape) == 2 assert index_dim < 2 @@ -717,7 +731,7 @@ def gather_variable(shape, index_dim, max_indices): index = torch.LongTensor(*shape) for i in range(shape[index_dim]): index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) + torch.randperm(max_indices)[:shape[batch_dim]]) return Variable(index, requires_grad=False) @@ -725,215 +739,215 @@ L = 20 M = 10 S = 5 function_tests = [ - (Add, (), ((M, M), (M, M)) ), - (Sub, (), ((M, M), (M, M)) ), - (Mul, (), ((M, M), (M, M)) ), - (Div, (), ((M, M), torch.rand(M, M) + 5e-2) ), - (Pow, (), (torch.rand(M, M) + 1e-3, torch.rand(M, M) + 0.1)), - (AddConstant, (3.14,), ((L, L),) ), - (SubConstant, (3.14,), ((L, L),) ), - (SubConstant, (3.14, True), ((L, L),), 'from_tensor' ), - (MulConstant, (3.14,), ((L, L),) ), - (DivConstant, (3.14, True), (torch.rand(L, L) + 1e-1,), 'by_tensor' ), - (PowConstant, (3.14,), (torch.rand(L, L),) ), - (PowConstant, (3.14, True), (torch.rand(L, L),), 'tensor_power' ), - (Transpose, (0, 1), (torch.rand(L, L),) ), - (Transpose, (2, 0), (torch.rand(S, S, S),), '3d' ), - (Permute, ((0, 4, 3, 5, 1, 2),), ((1, 2, 3, 4, 5, 6),) ), - (Index, ((1, 2),), (torch.rand(S, S, S),) ), - (Index, (slice(0, 3),), (torch.rand(S, S, S),), 'slice' ), - (Index, ((slice(0, 3), 1),),(torch.rand(S, S, S),), 'slice_index' ), - (View, (S*S, S), (torch.rand(S, S, S),) ), - (Expand, ((S, 5, S, 5),), ((S, 1, S, 1),) ), - (Exp, (), (torch.rand(S, S, S),) ), - (Log, (), (torch.rand(S, S, S) + 1e-2,) ), - (Log1p, (), (torch.rand(S, S, S),) ), - (Tanh, (), ((S, S, S),) ), - (Sigmoid, (), ((S, S, S),) ), - (Sinh, (), ((S, S, S),) ), - (Cosh, (), ((S, S, S),) ), - (Abs, (), ((S, S, S),) ), - (Clamp, (0, 1), ((S, S, S),) ), - (Sqrt, (), (torch.rand(S, S, S) + 5e-4,) ), - (Sin, (), ((S, S, S),) ), - (Cos, (), ((S, S, S),) ), - (Tan, (), (torch.randn(S, S, S).clamp(-1, 1),) ), - (Asin, (), (torch.randn(S, S, S).clamp(-0.9, 0.9),) ), - (Acos, (), (torch.randn(S, S, S).clamp(-0.9, 0.9),) ), - (Atan, (), ((S, S, S),) ), - (Reciprocal, (), (torch.rand(S, S, S) + 0.1,) ), - (Cmax, (), ((S, S, S), (S, S, S)) ), - (Cmin, (), ((S, S, S), (S, S, S)) ), - (Round, (), ((S, S, S),) ), - (Sign, (), ((S, S, S),) ), - (Trunc, (), ((S, S, S),) ), - (Floor, (), ((S, S, S),) ), - (Ceil, (), ((S, S, S),) ), - (Frac, (), ((S, S, S),) ), - (Fmod, (1.5,), ((S, S, S),) ), - (Lerp, (0.2,), ((S, S, S), (S, S, S)) ), - (Rsqrt, (), (torch.rand(S, S, S) + 1e-2,) ), - (Remainder, (1.5,), ((S, S, S),) ), - (CmaxConstant, (0.5,), ((S, S, S),) ), - (CminConstant, (0.5,), ((S, S, S),) ), - (Mean, (), ((S, S, S),) ), - (Mean, (1,), ((S, S, S),), 'dim' ), - (Sum, (), ((S, S, S),) ), - (Sum, (1,), ((S, S, S),), 'dim' ), - (Prod, (), ((S, S, S),) ), - (Prod, (1,), ((S, S, S),), 'dim' ), - (Addmm, (), ((S, M), (S, S), (S, M)), ), - (Addmm, (0.1, 1), ((S, M), (S, S), (S, M)), 'coef' ), - (Addbmm, (), ((S, M), (S, S, S), (S, S, M)), ), - (Addbmm, (0.1, 0.4), ((S, M), (S, S, S), (S, S, M)), 'coef' ), - (Baddbmm, (), ((S, S, M), (S, S, S), (S, S, M)), ), - (Baddbmm, (0.1, 0.4), ((S, S, M), (S, S, S), (S, S, M)), 'coef' ), - (Addmv, (), ((S,), (S, M), (M,)), ), - (Addmv, (0.1, 0.4), ((S,), (S, M), (M,)), 
'coef' ), - (Addr, (), ((S, M), (S,), (M,)), ), - (Addr, (0.1, 0.4), ((S, M), (S,), (M,)), 'coef' ), - (Dot, (), ((L,), (L,)), ), - (Max, (), ((S, S, S),), ), - (Min, (), ((S, S, S),), ), - (Max, (0,), ((S, S, S),), 'dim' ), - (Min, (0,), ((S, S, S),), 'dim' ), - (Mode, (0,), ((S, S, S),), ), - (Kthvalue, (2, 0), ((S, S, S),), ), - (Median, (0,), ((S, S, S),), ), - (Norm, (1.5,), (torch.rand(S, S, S),), '1_5' ), - (Norm, (), ((S, S, S),), '2' ), - (Norm, (3,), ((S, S, S),), '3' ), - (Norm, (1.5, 0), (torch.rand(S, S, S),), '1_5_dim' ), - (Norm, (2, 0), ((S, S, S),), '2_dim' ), - (Norm, (3, 0), ((S, S, S),), '3_dim' ), - (Addcmul, (), ((S, S), (S, S), (S, S)) ), - (Addcmul, (0.6,), ((S, S), (S, S), (S, S)), 'scale' ), - (Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 1e-2) ), - (Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 1e-2), 'scale'), - (IndexAdd, (0,), ((S, S), index_variable(2, S), (2, S)) ), + (Add, (), ((M, M), (M, M))), + (Sub, (), ((M, M), (M, M))), + (Mul, (), ((M, M), (M, M))), + (Div, (), ((M, M), torch.rand(M, M) + 5e-2)), + (Pow, (), (torch.rand(M, M) + 1e-3, torch.rand(M, M) + 0.1)), + (AddConstant, (3.14,), ((L, L),)), + (SubConstant, (3.14,), ((L, L),)), + (SubConstant, (3.14, True), ((L, L),), 'from_tensor'), + (MulConstant, (3.14,), ((L, L),)), + (DivConstant, (3.14, True), (torch.rand(L, L) + 1e-1,), 'by_tensor'), + (PowConstant, (3.14,), (torch.rand(L, L),)), + (PowConstant, (3.14, True), (torch.rand(L, L),), 'tensor_power'), + (Transpose, (0, 1), (torch.rand(L, L),)), + (Transpose, (2, 0), (torch.rand(S, S, S),), '3d'), + (Permute, ((0, 4, 3, 5, 1, 2),), ((1, 2, 3, 4, 5, 6),)), + (Index, ((1, 2),), (torch.rand(S, S, S),)), + (Index, (slice(0, 3),), (torch.rand(S, S, S),), 'slice'), + (Index, ((slice(0, 3), 1),), (torch.rand(S, S, S),), 'slice_index'), + (View, (S * S, S), (torch.rand(S, S, S),)), + (Expand, ((S, 5, S, 5),), ((S, 1, S, 1),)), + (Exp, (), (torch.rand(S, S, S),)), + (Log, (), (torch.rand(S, S, S) + 1e-2,)), + (Log1p, (), (torch.rand(S, S, S),)), + (Tanh, (), ((S, S, S),)), + (Sigmoid, (), ((S, S, S),)), + (Sinh, (), ((S, S, S),)), + (Cosh, (), ((S, S, S),)), + (Abs, (), ((S, S, S),)), + (Clamp, (0, 1), ((S, S, S),)), + (Sqrt, (), (torch.rand(S, S, S) + 5e-4,)), + (Sin, (), ((S, S, S),)), + (Cos, (), ((S, S, S),)), + (Tan, (), (torch.randn(S, S, S).clamp(-1, 1),)), + (Asin, (), (torch.randn(S, S, S).clamp(-0.9, 0.9),)), + (Acos, (), (torch.randn(S, S, S).clamp(-0.9, 0.9),)), + (Atan, (), ((S, S, S),)), + (Reciprocal, (), (torch.rand(S, S, S) + 0.1,)), + (Cmax, (), ((S, S, S), (S, S, S))), + (Cmin, (), ((S, S, S), (S, S, S))), + (Round, (), ((S, S, S),)), + (Sign, (), ((S, S, S),)), + (Trunc, (), ((S, S, S),)), + (Floor, (), ((S, S, S),)), + (Ceil, (), ((S, S, S),)), + (Frac, (), ((S, S, S),)), + (Fmod, (1.5,), ((S, S, S),)), + (Lerp, (0.2,), ((S, S, S), (S, S, S))), + (Rsqrt, (), (torch.rand(S, S, S) + 1e-2,)), + (Remainder, (1.5,), ((S, S, S),)), + (CmaxConstant, (0.5,), ((S, S, S),)), + (CminConstant, (0.5,), ((S, S, S),)), + (Mean, (), ((S, S, S),)), + (Mean, (1,), ((S, S, S),), 'dim'), + (Sum, (), ((S, S, S),)), + (Sum, (1,), ((S, S, S),), 'dim'), + (Prod, (), ((S, S, S),)), + (Prod, (1,), ((S, S, S),), 'dim'), + (Addmm, (), ((S, M), (S, S), (S, M)),), + (Addmm, (0.1, 1), ((S, M), (S, S), (S, M)), 'coef'), + (Addbmm, (), ((S, M), (S, S, S), (S, S, M)),), + (Addbmm, (0.1, 0.4), ((S, M), (S, S, S), (S, S, M)), 'coef'), + (Baddbmm, (), ((S, S, M), (S, S, S), (S, S, M)),), + (Baddbmm, (0.1, 0.4), ((S, S, M), (S, S, S), (S, S, M)), 'coef'), + 
(Addmv, (), ((S,), (S, M), (M,)),), + (Addmv, (0.1, 0.4), ((S,), (S, M), (M,)), 'coef'), + (Addr, (), ((S, M), (S,), (M,)),), + (Addr, (0.1, 0.4), ((S, M), (S,), (M,)), 'coef'), + (Dot, (), ((L,), (L,)),), + (Max, (), ((S, S, S),),), + (Min, (), ((S, S, S),),), + (Max, (0,), ((S, S, S),), 'dim'), + (Min, (0,), ((S, S, S),), 'dim'), + (Mode, (0,), ((S, S, S),),), + (Kthvalue, (2, 0), ((S, S, S),),), + (Median, (0,), ((S, S, S),),), + (Norm, (1.5,), (torch.rand(S, S, S),), '1_5'), + (Norm, (), ((S, S, S),), '2'), + (Norm, (3,), ((S, S, S),), '3'), + (Norm, (1.5, 0), (torch.rand(S, S, S),), '1_5_dim'), + (Norm, (2, 0), ((S, S, S),), '2_dim'), + (Norm, (3, 0), ((S, S, S),), '3_dim'), + (Addcmul, (), ((S, S), (S, S), (S, S))), + (Addcmul, (0.6,), ((S, S), (S, S), (S, S)), 'scale'), + (Addcdiv, (), ((S, S), (S, S), torch.rand(S, S) + 1e-2)), + (Addcdiv, (0.6,), ((S, S), (S, S), torch.rand(S, S) + 1e-2), 'scale'), + (IndexAdd, (0,), ((S, S), index_variable(2, S), (2, S))), # (IndexCopy, (0,), ((S, S), index_variable(2, S), (2, S)) ), - (IndexFill, (0, 2), ((S, S), index_variable(2, S)) ), - (IndexSelect, (0,), ((S, S), index_variable(2, S)) ), - (Gather, (0,), ((M, S), gather_variable((S, S), 1, M)) ), - (Gather, (1,), ((M, S), gather_variable((M, S//2), 0, S)), 'dim1'), - (Scatter, (0,), ((M, S), gather_variable((S, S), 1, M), (S, S))), - (Scatter, (1,), ((M, S), gather_variable((M, S//2), 0, S), (M, S//2)), 'dim1'), - (Concat, (0,), ((1, S, S), (2, S, S), (3, S, S)) ), - (Resize, (S*S, S), ((S, S, S),) ), - (Diag, (), ((S, S),), '2d' ), - (Diag, (), ((S,),), '1d' ), - (Tril, (), ((S, S),) ), - (Tril, (2,), ((S, S),), 'idx' ), - (Triu, (), ((S, S),) ), - (Triu, (2,), ((S, S),), 'idx' ), - (Clone, (), ((S, M, S),) ), - (Squeeze, (), ((S, 1, M, 1),) ), - (Squeeze, (1,), ((S, 1, M, 1),), 'dim' ), - (Unsqueeze, (0,), ((S, M, S),), '0' ), - (Unsqueeze, (1,), ((S, M, S),), '1' ), + (IndexFill, (0, 2), ((S, S), index_variable(2, S))), + (IndexSelect, (0,), ((S, S), index_variable(2, S))), + (Gather, (0,), ((M, S), gather_variable((S, S), 1, M))), + (Gather, (1,), ((M, S), gather_variable((M, S // 2), 0, S)), 'dim1'), + (Scatter, (0,), ((M, S), gather_variable((S, S), 1, M), (S, S))), + (Scatter, (1,), ((M, S), gather_variable((M, S // 2), 0, S), (M, S // 2)), 'dim1'), + (Concat, (0,), ((1, S, S), (2, S, S), (3, S, S))), + (Resize, (S * S, S), ((S, S, S),)), + (Diag, (), ((S, S),), '2d'), + (Diag, (), ((S,),), '1d'), + (Tril, (), ((S, S),)), + (Tril, (2,), ((S, S),), 'idx'), + (Triu, (), ((S, S),)), + (Triu, (2,), ((S, S),), 'idx'), + (Clone, (), ((S, M, S),)), + (Squeeze, (), ((S, 1, M, 1),)), + (Squeeze, (1,), ((S, 1, M, 1),), 'dim'), + (Unsqueeze, (0,), ((S, M, S),), '0'), + (Unsqueeze, (1,), ((S, M, S),), '1'), # (MaskedCopy, (), ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False), (S, S),)), - (MaskedFill, (10,), ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))), - (MaskedSelect, (), ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))), - (Sort, (), ((S, M, S),) ), - (Sort, (1,), ((S, M, S),), 'dim' ), - (Sort, (1, True), ((S, M, S),), 'dim_desc' ), - (Topk, (3,), ((S, M, S),) ), - (Topk, (3, 1), ((S, M, S),), 'dim' ), - (Topk, (3, 1, True), ((S, M, S),), 'dim_desc' ), - (Topk, (3, 1, True, True), ((S, M, S),), 'dim_desc_sort' ), + (MaskedFill, (10,), ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))), + (MaskedSelect, (), ((S, S), Variable(torch.randn(S, S).gt(0), requires_grad=False))), + (Sort, (), ((S, M, S),)), + (Sort, (1,), ((S, M, 
S),), 'dim'), + (Sort, (1, True), ((S, M, S),), 'dim_desc'), + (Topk, (3,), ((S, M, S),)), + (Topk, (3, 1), ((S, M, S),), 'dim'), + (Topk, (3, 1, True), ((S, M, S),), 'dim_desc'), + (Topk, (3, 1, True, True), ((S, M, S),), 'dim_desc_sort'), ] method_tests = [ - ('add', (S, S, S), ((S, S, S),) ), - ('add', (S, S, S), (3.14,), 'constant' ), - ('sub', (S, S, S), ((S, S, S),) ), - ('sub', (S, S, S), (3.14,), 'constant' ), - ('mul', (S, S, S), ((S, S, S),) ), - ('mul', (S, S, S), (3.14,), 'constant' ), - ('div', (S, S, S), ((S, S, S),) ), - ('div', (S, S, S), (3.14,), 'constant' ), - ('pow', (S, S, S), ((S, S, S),) ), - ('pow', (S, S, S), (3.14,), 'constant' ), - ('transpose', (1, 2, 3), (1, 2) ), - ('t', (1, 2), () ), - ('view', (S, S, S), (S*S, S), ), - ('view_as', (S, S, S), ((S*S, S),) ), - ('expand', (S, 1, S), (S, S, S) ), - ('expand', (torch.Size([S, 1, S]),), (S, S, S), 'size' ), - ('exp', (S, S, S), () ), - ('log', (S, S, S), () ), - ('log1p', (S, S, S), () ), - ('tanh', (S, S, S), () ), - ('sigmoid', (S, S, S), () ), - ('sinh', (S, S, S), () ), - ('cosh', (S, S, S), () ), - ('abs', (S, S, S), () ), - ('clamp', (S, S, S), (0, 1) ), - ('sqrt', (S, S, S), () ), - ('sin', (S, S, S), () ), - ('cos', (S, S, S), () ), - ('tan', (S, S, S), () ), - ('asin', (S, S, S), () ), - ('acos', (S, S, S), () ), - ('atan', (S, S, S), () ), - ('reciprocal', (S, S, S), () ), - ('round', (S, S, S), () ), - ('sign', (S, S, S), () ), - ('trunc', (S, S, S), () ), - ('floor', (S, S, S), () ), - ('ceil', (S, S, S), () ), - ('rsqrt', (S, S, S), () ), - ('fmod', (S, S, S), (1.5,) ), - ('remainder', (S, S, S), (1.5,) ), - ('lerp', (S, S, S), ((S, S, S), 0.4) ), - ('max', (S, S, S), () ), - ('max', (S, S, S), ((S, S, S),), 'elementwise' ), - ('min', (S, S, S), () ), - ('min', (S, S, S), ((S, S, S),), 'elementwise' ), - ('mean', (S, S, S), () ), - ('mean', (S, S, S), (1,), 'dim' ), - ('sum', (S, S, S), () ), - ('sum', (S, S, S), (1,), 'dim' ), - ('prod', (S, S, S), () ), - ('prod', (S, S, S), (1,), 'dim' ), - ('addmm', (S, M), ((S, S), (S, M)), ), - ('addmm', (S, M), (0.2, 0.6, (S, S), (S, M)), 'coef' ), - ('addbmm', (S, M), ((S, S, S), (S, S, M)), ), - ('addbmm', (S, M), (0.2, 0.6, (S, S, S), (S, S, M)), 'coef' ), - ('baddbmm', (S, S, M), ((S, S, S), (S, S, M)), ), - ('baddbmm', (S, S, M), (0.2, 0.6, (S, S, S), (S, S, M)), 'coef' ), - ('addmv', (S,), ((S, M), (M,)), ), - ('addmv', (S,), (0.2, 0.6, (S, M), (M,)), 'coef' ), - ('addr', (S, M), ((S,), (M,)), ), - ('addr', (S, M), (0.2, 0.6, (S,), (M,)), 'coef' ), - ('dot', (L,), ((L,),), ), - ('addcmul', (S, S), ((S, S), (S, S)) ), - ('addcmul', (S, S), (0.5, (S, S), (S, S)), 'scale' ), - ('addcdiv', (S, S), ((S, S), (S, S)) ), - ('addcdiv', (S, S), (0.5, (S, S), (S, S)), 'scale' ), - ('norm', (S, S, S), (2,) ), - ('norm', (S, S, S), (2, 1), 'dim' ), - ('dist', (S, S, S), ((S, S, S),) ), - ('dist', (S, S, S), ((S, S, S), 4), '4' ), - ('index_select', (S, S, S), (0, index_variable(2, S)) ), - ('diag', (M, M), (), '2d' ), - ('diag', (M,), (), '1d' ), - ('tril', (M, M), () ), - ('triu', (M, M), () ), - ('clone', (S, M, S), () ), - ('permute', (1, 2, 3, 4), (0, 2, 3, 1) ), - ('select', (S, S, S), (1, 2) ), - ('narrow', (S, S, S), (1, 2, 2) ), - ('squeeze', (S, 1, S, 1), () ), - ('squeeze', (S, 1, S, 1), (1,), '1_dim' ), - ('squeeze', (S, 1, S, 1), (2,), 'not_1_dim' ), - ('unsqueeze', (S, S, S), (0,), 'first' ), - ('unsqueeze', (S, S, S), (1,), 'middle' ), - ('unsqueeze', (S, S, S), (3,), 'last' ), - ('masked_select', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), 
requires_grad=False),) ), - ('masked_fill_', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False), 10) ), - ('masked_copy_', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False), (M, M)) ), + ('add', (S, S, S), ((S, S, S),)), + ('add', (S, S, S), (3.14,), 'constant'), + ('sub', (S, S, S), ((S, S, S),)), + ('sub', (S, S, S), (3.14,), 'constant'), + ('mul', (S, S, S), ((S, S, S),)), + ('mul', (S, S, S), (3.14,), 'constant'), + ('div', (S, S, S), ((S, S, S),)), + ('div', (S, S, S), (3.14,), 'constant'), + ('pow', (S, S, S), ((S, S, S),)), + ('pow', (S, S, S), (3.14,), 'constant'), + ('transpose', (1, 2, 3), (1, 2)), + ('t', (1, 2), ()), + ('view', (S, S, S), (S * S, S),), + ('view_as', (S, S, S), ((S * S, S),)), + ('expand', (S, 1, S), (S, S, S)), + ('expand', (torch.Size([S, 1, S]),), (S, S, S), 'size'), + ('exp', (S, S, S), ()), + ('log', (S, S, S), ()), + ('log1p', (S, S, S), ()), + ('tanh', (S, S, S), ()), + ('sigmoid', (S, S, S), ()), + ('sinh', (S, S, S), ()), + ('cosh', (S, S, S), ()), + ('abs', (S, S, S), ()), + ('clamp', (S, S, S), (0, 1)), + ('sqrt', (S, S, S), ()), + ('sin', (S, S, S), ()), + ('cos', (S, S, S), ()), + ('tan', (S, S, S), ()), + ('asin', (S, S, S), ()), + ('acos', (S, S, S), ()), + ('atan', (S, S, S), ()), + ('reciprocal', (S, S, S), ()), + ('round', (S, S, S), ()), + ('sign', (S, S, S), ()), + ('trunc', (S, S, S), ()), + ('floor', (S, S, S), ()), + ('ceil', (S, S, S), ()), + ('rsqrt', (S, S, S), ()), + ('fmod', (S, S, S), (1.5,)), + ('remainder', (S, S, S), (1.5,)), + ('lerp', (S, S, S), ((S, S, S), 0.4)), + ('max', (S, S, S), ()), + ('max', (S, S, S), ((S, S, S),), 'elementwise'), + ('min', (S, S, S), ()), + ('min', (S, S, S), ((S, S, S),), 'elementwise'), + ('mean', (S, S, S), ()), + ('mean', (S, S, S), (1,), 'dim'), + ('sum', (S, S, S), ()), + ('sum', (S, S, S), (1,), 'dim'), + ('prod', (S, S, S), ()), + ('prod', (S, S, S), (1,), 'dim'), + ('addmm', (S, M), ((S, S), (S, M)),), + ('addmm', (S, M), (0.2, 0.6, (S, S), (S, M)), 'coef'), + ('addbmm', (S, M), ((S, S, S), (S, S, M)),), + ('addbmm', (S, M), (0.2, 0.6, (S, S, S), (S, S, M)), 'coef'), + ('baddbmm', (S, S, M), ((S, S, S), (S, S, M)),), + ('baddbmm', (S, S, M), (0.2, 0.6, (S, S, S), (S, S, M)), 'coef'), + ('addmv', (S,), ((S, M), (M,)),), + ('addmv', (S,), (0.2, 0.6, (S, M), (M,)), 'coef'), + ('addr', (S, M), ((S,), (M,)),), + ('addr', (S, M), (0.2, 0.6, (S,), (M,)), 'coef'), + ('dot', (L,), ((L,),),), + ('addcmul', (S, S), ((S, S), (S, S))), + ('addcmul', (S, S), (0.5, (S, S), (S, S)), 'scale'), + ('addcdiv', (S, S), ((S, S), (S, S))), + ('addcdiv', (S, S), (0.5, (S, S), (S, S)), 'scale'), + ('norm', (S, S, S), (2,)), + ('norm', (S, S, S), (2, 1), 'dim'), + ('dist', (S, S, S), ((S, S, S),)), + ('dist', (S, S, S), ((S, S, S), 4), '4'), + ('index_select', (S, S, S), (0, index_variable(2, S))), + ('diag', (M, M), (), '2d'), + ('diag', (M,), (), '1d'), + ('tril', (M, M), ()), + ('triu', (M, M), ()), + ('clone', (S, M, S), ()), + ('permute', (1, 2, 3, 4), (0, 2, 3, 1)), + ('select', (S, S, S), (1, 2)), + ('narrow', (S, S, S), (1, 2, 2)), + ('squeeze', (S, 1, S, 1), ()), + ('squeeze', (S, 1, S, 1), (1,), '1_dim'), + ('squeeze', (S, 1, S, 1), (2,), 'not_1_dim'), + ('unsqueeze', (S, S, S), (0,), 'first'), + ('unsqueeze', (S, S, S), (1,), 'middle'), + ('unsqueeze', (S, S, S), (3,), 'last'), + ('masked_select', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False),)), + ('masked_fill_', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), 
requires_grad=False), 10)), + ('masked_copy_', (M, M), (Variable(torch.ByteTensor(M, M).bernoulli_(), requires_grad=False), (M, M))), ] # TODO: mm, bmm, mv, ger # TODO: max, min with dim (problem with indices) @@ -946,6 +960,7 @@ method_tests = [ def create_input(call_args): if not isinstance(call_args, tuple): call_args = (call_args,) + def map_arg(arg): if isinstance(arg, tuple) and not isinstance(arg[0], Variable): return Variable(torch.randn(*arg).double(), requires_grad=True) @@ -976,8 +991,9 @@ ignore_inplace = set(( for test in function_tests: cls, constructor_args, call_args = test[:3] test_name = 'test_' + cls.__name__ + ('_' + test[3] if len(test) == 4 else '') + def do_test(self, cls=cls, constructor_args=constructor_args, - call_args=call_args, test_name=test_name): + call_args=call_args, test_name=test_name): input = create_input(call_args) output = cls(*constructor_args)(*input) if not isinstance(output, tuple): @@ -986,6 +1002,7 @@ for test in function_tests: if not o.requires_grad: continue analytical = get_analytical_jacobian(input, o) + def fn(input): tmp = cls(*constructor_args)(*input) if not isinstance(tmp, tuple): @@ -1032,6 +1049,7 @@ EXCLUDE_FUNCTIONAL = { for test in method_tests: name, self_size, args = test[:3] test_name = 'test_' + name + ('_' + test[3] if len(test) == 4 else '') + def do_test(self, name=name, self_size=self_size, args=args, test_name=test_name): def check(name): self_variable = create_input((self_size,))[0] @@ -1064,7 +1082,6 @@ for test in method_tests: if not 'only supports scalar' in e.args[0]: raise - assert not hasattr(TestAutograd, test_name), 'Two tests have the same name: ' + test_name setattr(TestAutograd, test_name, do_test) diff --git a/test/test_cuda.py b/test/test_cuda.py index fa359cb642..2d5f06bad2 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -14,6 +14,7 @@ if not torch.cuda.is_available(): import sys sys.exit() + def is_floating(t): return type(t) in [torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] @@ -31,7 +32,8 @@ types = [ float_types = [ torch.FloatTensor, torch.DoubleTensor -] # TODO: add half... +] # TODO: add half... 
+ def number(floating, integer, t): name = type(t).__name__ @@ -44,188 +46,204 @@ def number(floating, integer, t): S = 10 M = 50 + def make_tensor(t, *sizes): return t(*sizes).copy_(torch.randn(*sizes)) + def small_2d(t): return make_tensor(t, S, S) + def small_2d_scaled(t, scale=10): return make_tensor(t, S, S).mul(scale) + def small_3d(t): return make_tensor(t, S, S, S) + def medium_1d(t): return make_tensor(t, M) + def medium_2d(t): return make_tensor(t, M, M) + def medium_2d_scaled(t, scale=10): return make_tensor(t, M, M).mul(scale) + def small_3d_ones(t): return t(S, S, S).copy_(torch.ones(S, S, S)) + def small_3d_positive(t): min_val = 1e-3 if is_floating(t) else 2 return make_tensor(t, S, S, S).clamp_(min_val, 120) + def small_3d_unique(t): - return t(S, S, S).copy_(torch.range(1, S*S*S)) + return t(S, S, S).copy_(torch.range(1, S * S * S)) + def small_1d_lapack(t): return t(1, 3).copy_(torch.range(1, 3).view(3)) + def small_2d_lapack(t): return t(3, 3).copy_(torch.range(1, 9).view(3, 3)) + def small_2d_lapack_skinny(t): return t(3, 4).copy_(torch.range(1, 12).view(3, 4)) + def small_2d_lapack_fat(t): return t(4, 3).copy_(torch.range(1, 12).view(4, 3)) + def new_t(*sizes): def tmp(t): return t(*sizes).copy_(torch.randn(*sizes)) return tmp tests = [ - ('add', small_3d, lambda t: [number(3.14, 3, t)] ), - ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor' ), - ('sub', small_3d, lambda t: [number(3.14, 3, t)], ), - ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('mul', small_3d, lambda t: [number(3.14, 3, t)], ), - ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('div', small_3d, lambda t: [number(3.14, 3, t)], ), - ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), - ('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types), - ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), - ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ), - ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ), - ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)], ), - ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ), - ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars' ), - ('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)], ), - ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar' ), - ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)], ), - ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar' ), - ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)], ), - ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar' ), - ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars' ), - ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)], ), - ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar' ), - ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars' ), - ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)], 
), - ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar' ), - ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars' ), - ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types), - ('fmod', small_3d, lambda t: [3], 'value' ), - ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('chunk', medium_2d, lambda t: [4], ), - ('chunk', medium_2d, lambda t: [4, 1], 'dim' ), - ('clamp', medium_2d_scaled, lambda t: [-1, 5], ), - ('clone', medium_2d, lambda t: [], ), - ('contiguous', medium_2d, lambda t: [], ), - ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)], ), - ('cumprod', small_3d, lambda t: [1], ), - ('cumsum', small_3d, lambda t: [1], ), - ('dim', small_3d, lambda t: [], ), - ('dist', small_2d, lambda t: [small_2d(t)], ), - ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm' ), - ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm' ), - ('dot', medium_1d, lambda t: [medium_1d(t)], ), - ('element_size', medium_1d, lambda t: [], ), - ('eq', small_3d_ones, lambda t: [small_3d(t)], ), - ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ), - ('ne', small_3d_ones, lambda t: [small_3d(t)], ), - ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ), - ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal' ), - ('equal', small_3d_ones, lambda t: [small_3d(t)], ), - ('expand', new_t(M, 1, M), lambda t: [M, 4, M], ), - ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)], ), - ('fill', medium_2d, lambda t: [number(3.14, 3, t)], ), - ('ge', medium_2d, lambda t: [medium_2d(t)], ), - ('le', medium_2d, lambda t: [medium_2d(t)], ), - ('gt', medium_2d, lambda t: [medium_2d(t)], ), - ('lt', medium_2d, lambda t: [medium_2d(t)], ), - ('is_contiguous', medium_2d, lambda t: [], ), + ('add', small_3d, lambda t: [number(3.14, 3, t)]), + ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'), + ('sub', small_3d, lambda t: [number(3.14, 3, t)],), + ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('mul', small_3d, lambda t: [number(3.14, 3, t)],), + ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('div', small_3d, lambda t: [number(3.14, 3, t)],), + ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), + ('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types), + ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), + ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), + ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), + ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],), + ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), + ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), + ('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)],), + ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), + small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar'), + ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)],), + ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), + ('addmm', medium_2d, lambda t: [medium_2d(t), 
medium_2d(t)],), + ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'), + ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'), + ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],), + ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'), + ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'), + ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)],), + ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'), + ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'), + ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types), + ('fmod', small_3d, lambda t: [3], 'value'), + ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('chunk', medium_2d, lambda t: [4],), + ('chunk', medium_2d, lambda t: [4, 1], 'dim'), + ('clamp', medium_2d_scaled, lambda t: [-1, 5],), + ('clone', medium_2d, lambda t: [],), + ('contiguous', medium_2d, lambda t: [],), + ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],), + ('cumprod', small_3d, lambda t: [1],), + ('cumsum', small_3d, lambda t: [1],), + ('dim', small_3d, lambda t: [],), + ('dist', small_2d, lambda t: [small_2d(t)],), + ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'), + ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'), + ('dot', medium_1d, lambda t: [medium_1d(t)],), + ('element_size', medium_1d, lambda t: [],), + ('eq', small_3d_ones, lambda t: [small_3d(t)],), + ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), + ('ne', small_3d_ones, lambda t: [small_3d(t)],), + ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), + ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), + ('equal', small_3d_ones, lambda t: [small_3d(t)],), + ('expand', new_t(M, 1, M), lambda t: [M, 4, M],), + ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],), + ('fill', medium_2d, lambda t: [number(3.14, 3, t)],), + ('ge', medium_2d, lambda t: [medium_2d(t)],), + ('le', medium_2d, lambda t: [medium_2d(t)],), + ('gt', medium_2d, lambda t: [medium_2d(t)],), + ('lt', medium_2d, lambda t: [medium_2d(t)],), + ('is_contiguous', medium_2d, lambda t: [],), # TODO: can't check negative case - GPU copy will be contiguous - ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative' ), - ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive' ), - ('is_set_to', medium_2d, lambda t: [medium_2d(t)], ), + ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'), + ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'), + ('is_set_to', medium_2d, lambda t: [medium_2d(t)],), # TODO: positive case - ('kthvalue', small_3d_unique, lambda t: [3], ), - ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim' ), - ('lerp', small_3d, lambda t: [small_3d(t), 0.3], ), - ('max', small_3d_unique, lambda t: [], ), - ('max', small_3d_unique, lambda t: [1], 'dim' ), - ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise' ), - ('min', small_3d_unique, lambda t: [], ), - ('min', small_3d_unique, lambda t: [1], 'dim' ), - ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise' ), - ('mean', small_3d, lambda t: [], ), - ('mean', small_3d, lambda t: [1], 'dim' ), - ('mode', small_3d, lambda t: [], ), - ('mode', small_3d, lambda t: [1], 'dim' ), - ('remainder', small_3d, lambda t: [3], 'value' 
), - ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor' ), - ('std', small_3d, lambda t: [], ), - ('std', small_3d, lambda t: [1], 'dim' ), - ('var', small_3d, lambda t: [], ), - ('var', small_3d, lambda t: [1], 'dim' ), - ('ndimension', small_3d, lambda t: [], ), - ('nelement', small_3d, lambda t: [], ), - ('numel', small_3d, lambda t: [], ), - ('narrow', small_3d, lambda t: [1, 3, 2], ), - ('nonzero', small_3d, lambda t: [], ), - ('norm', small_3d, lambda t: [], ), - ('norm', small_3d, lambda t: [3], '3_norm' ), - ('norm', small_3d, lambda t: [3, 0], '3_norm_dim' ), - ('ones', small_3d, lambda t: [1, 2, 3, 4, 5], ), - ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0], ), - ('prod', small_3d, lambda t: [], ), - ('prod', small_3d, lambda t: [1], 'dim' ), - ('sum', small_2d, lambda t: [], ), - ('sum', small_3d, lambda t: [1], 'dim' ), - ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm' ), - ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm' ), - ('repeat', small_2d, lambda t: [2, 2, 2], ), - ('size', new_t(1, 2, 3, 4), lambda t: [], ), - ('sort', small_3d_unique, lambda t: [], ), - ('sort', small_3d_unique, lambda t: [1], 'dim' ), - ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), - ('split', small_3d, lambda t: [2], ), - ('split', small_3d, lambda t: [2, 1], 'dim' ), - ('squeeze', new_t(1, 2, 1, 4), lambda t: [], ), - ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim' ), - ('t', new_t(1, 2), lambda t: [], ), - ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2], ), - ('to_list', small_3d, lambda t: [], ), - ('topk', small_3d, lambda t: [2, 1, False, True], 'dim_sort' ), - ('topk', small_3d, lambda t: [2, 1, True, True], 'dim_desc_sort' ), - ('trace', medium_2d, lambda t: [], ), - ('tril', medium_2d, lambda t: [], ), - ('tril', medium_2d, lambda t: [2], 'positive' ), - ('tril', medium_2d, lambda t: [-2], 'negative' ), - ('triu', medium_2d, lambda t: [], ), - ('triu', medium_2d, lambda t: [2], 'positive' ), - ('triu', medium_2d, lambda t: [-2], 'negative' ), - ('view', small_3d, lambda t: [100, 10], ), - ('view_as', small_3d, lambda t: [t(100, 10)], ), - ('zero', small_3d, lambda t: [], ), - ('zeros', small_3d, lambda t: [1, 2, 3, 4], ), - ('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types), - ('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types), - ('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types), + ('kthvalue', small_3d_unique, lambda t: [3],), + ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'), + ('lerp', small_3d, lambda t: [small_3d(t), 0.3],), + ('max', small_3d_unique, lambda t: [],), + ('max', small_3d_unique, lambda t: [1], 'dim'), + ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), + ('min', small_3d_unique, lambda t: [],), + ('min', small_3d_unique, lambda t: [1], 'dim'), + ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), + ('mean', small_3d, lambda t: [],), + ('mean', small_3d, lambda t: [1], 'dim'), + ('mode', small_3d, lambda t: [],), + ('mode', small_3d, lambda t: [1], 'dim'), + ('remainder', small_3d, lambda t: [3], 'value'), + ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), + ('std', small_3d, lambda t: [],), + ('std', small_3d, lambda t: [1], 'dim'), + ('var', small_3d, lambda t: [],), + ('var', small_3d, lambda t: [1], 'dim'), + ('ndimension', small_3d, lambda t: [],), + ('nelement', small_3d, lambda t: [],), + ('numel', small_3d, lambda t: [],), + ('narrow', small_3d, lambda t: [1, 3, 2],), + ('nonzero', small_3d, 
lambda t: [],), + ('norm', small_3d, lambda t: [],), + ('norm', small_3d, lambda t: [3], '3_norm'), + ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'), + ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],), + ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],), + ('prod', small_3d, lambda t: [],), + ('prod', small_3d, lambda t: [1], 'dim'), + ('sum', small_2d, lambda t: [],), + ('sum', small_3d, lambda t: [1], 'dim'), + ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'), + ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'), + ('repeat', small_2d, lambda t: [2, 2, 2],), + ('size', new_t(1, 2, 3, 4), lambda t: [],), + ('sort', small_3d_unique, lambda t: [],), + ('sort', small_3d_unique, lambda t: [1], 'dim'), + ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), + ('split', small_3d, lambda t: [2],), + ('split', small_3d, lambda t: [2, 1], 'dim'), + ('squeeze', new_t(1, 2, 1, 4), lambda t: [],), + ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'), + ('t', new_t(1, 2), lambda t: [],), + ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],), + ('to_list', small_3d, lambda t: [],), + ('topk', small_3d, lambda t: [2, 1, False, True], 'dim_sort'), + ('topk', small_3d, lambda t: [2, 1, True, True], 'dim_desc_sort'), + ('trace', medium_2d, lambda t: [],), + ('tril', medium_2d, lambda t: [],), + ('tril', medium_2d, lambda t: [2], 'positive'), + ('tril', medium_2d, lambda t: [-2], 'negative'), + ('triu', medium_2d, lambda t: [],), + ('triu', medium_2d, lambda t: [2], 'positive'), + ('triu', medium_2d, lambda t: [-2], 'negative'), + ('view', small_3d, lambda t: [100, 10],), + ('view_as', small_3d, lambda t: [t(100, 10)],), + ('zero', small_3d, lambda t: [],), + ('zeros', small_3d, lambda t: [1, 2, 3, 4],), + ('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types), + ('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types), + ('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types), # lapack tests - ('qr', small_2d_lapack, lambda t: [], 'square', float_types), - ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types), - ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types), + ('qr', small_2d_lapack, lambda t: [], 'square', float_types), + ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types), + ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types), ] @@ -275,6 +293,8 @@ for fn in simple_pointwise_float: tests.append((fn, small_3d, lambda t: [], None, float_types)) _cycles_per_ms = None + + def get_cycles_per_ms(): """Approximate number of cycles per millisecond for torch.cuda._sleep""" global _cycles_per_ms @@ -288,6 +308,7 @@ def get_cycles_per_ms(): _cycles_per_ms = 1000000 / start.elapsed_time(end) return _cycles_per_ms + def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5): def tmp(self): cpu_tensor = tensor_constructor(t) @@ -314,6 +335,7 @@ def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5): self.assertEqual(cpu_result, gpu_result, precision) return tmp + class TestCuda(TestCase): def test_autogpu(self): @@ -412,7 +434,7 @@ class TestCuda(TestCase): y_cuda = y.cuda(1) result = comm.reduce_add((x_cuda, y_cuda)) self.assertEqual(result.get_device(), 0) - self.assertEqual(result.cpu(), x+y) + self.assertEqual(result.cpu(), x + y) def _test_scatter(self, input, chunk_sizes=None, dim=0): if torch.cuda.device_count() < 2: @@ -473,7 +495,7 @@ class TestCuda(TestCase): self._test_gather(1) def test_from_sequence(self): - seq = 
[list(range(i*4,i*4+4)) for i in range(5)] + seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)] reference = torch.range(0, 19).resize_(5, 4) for t in types: cuda_type = get_gpu_type(t) @@ -526,6 +548,7 @@ class TestCuda(TestCase): @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU") def test_multigpu_serialization_remap(self): x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] + def gpu_remap(storage, location): if location == 'cuda:1': return storage.cuda(0) @@ -666,7 +689,8 @@ for decl in tests: if not hasattr(tensor, name_inner): continue if not hasattr(gpu_tensor, name_inner): - print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(name_inner, gpu_tensor.__class__.__name__)) + print("Ignoring {}, because it's not implemented by torch.cuda.{}".format( + name_inner, gpu_tensor.__class__.__name__)) continue test_name = 'test_' + t.__name__ + '_' + name_inner diff --git a/test/test_dataloader.py b/test/test_dataloader.py index dc5b8bf162..23d5ac6ca6 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -27,11 +27,12 @@ class TestTensorDataset(TestCase): l = torch.randn(15) source = TensorDataset(t, l) for i in range(15): - self.assertEqual(t[i:i+1], source[i][0]) - self.assertEqual(l[i:i+1], source[i][1]) + self.assertEqual(t[i:i + 1], source[i][0]) + self.assertEqual(l[i:i + 1], source[i][1]) class ErrorDataset(Dataset): + def __init__(self, size): self.size = size @@ -50,9 +51,9 @@ class TestDataLoader(TestCase): batch_size = loader.batch_size for i, (sample, target) in enumerate(loader): idx = i * batch_size - self.assertEqual(sample, self.data[idx:idx+batch_size]) - self.assertEqual(target, self.labels[idx:idx+batch_size].view(-1, 1)) - self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size)) + self.assertEqual(sample, self.data[idx:idx + batch_size]) + self.assertEqual(target, self.labels[idx:idx + batch_size].view(-1, 1)) + self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size)) def _test_shuffle(self, loader): found_data = {i: 0 for i in range(self.data.size(0))} @@ -67,9 +68,9 @@ class TestDataLoader(TestCase): break self.assertEqual(target, self.labels.narrow(0, data_point_idx, 1)) found_labels[data_point_idx] += 1 - self.assertEqual(sum(found_data.values()), (i+1) * batch_size) - self.assertEqual(sum(found_labels.values()), (i+1) * batch_size) - self.assertEqual(i, math.floor((len(self.dataset)-1) / batch_size)) + self.assertEqual(sum(found_data.values()), (i + 1) * batch_size) + self.assertEqual(sum(found_labels.values()), (i + 1) * batch_size) + self.assertEqual(i, math.floor((len(self.dataset) - 1) / batch_size)) def _test_error(self, loader): it = iter(loader) @@ -81,10 +82,9 @@ class TestDataLoader(TestCase): errors += 1 except StopIteration: self.assertEqual(errors, - math.ceil(float(len(loader.dataset))/loader.batch_size)) + math.ceil(float(len(loader.dataset)) / loader.batch_size)) return - def test_sequential(self): self._test_sequential(DataLoader(self.dataset)) diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index d9b0d87615..731421f3f6 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -9,7 +9,9 @@ from common_nn import NNTestCase, ModuleTest, CriterionTest, iter_tensors, \ module_tests, criterion_tests, TEST_CUDA, PRECISION from common import to_gpu, freeze_rng_state, run_tests + class OldModuleTest(ModuleTest): + def __init__(self, *args, **kwargs): super(OldModuleTest, self).__init__(*args, **kwargs) self.check_inplace = 
kwargs.get('check_inplace', False) @@ -45,18 +47,18 @@ class OldModuleTest(ModuleTest): # TODO: hessian tests tests = [ OldModuleTest(nn.Add, - (torch.Size([5, 4]),), - input_size=(3, 5, 4), - desc='3D'), + (torch.Size([5, 4]),), + input_size=(3, 5, 4), + desc='3D'), OldModuleTest(nn.Add, - (1, True), - input_size=(3, 1, 4), - desc='scalar'), + (1, True), + input_size=(3, 1, 4), + desc='scalar'), OldModuleTest(nn.AddConstant, - (3.5,), - input_size=(3, 5, 4), - reference_fn=lambda i,_: i + 3.5, - check_inplace=True), + (3.5,), + input_size=(3, 5, 4), + reference_fn=lambda i, _: i + 3.5, + check_inplace=True), OldModuleTest(nn.BatchNormalization, (10,), input_size=(4, 10), @@ -88,435 +90,435 @@ tests = [ input_size=(2, 3, 4, 4, 4), desc='no_affine'), OldModuleTest(nn.CMul, - (5, 6), - input_size=(10, 5, 6), - desc='3D'), + (5, 6), + input_size=(10, 5, 6), + desc='3D'), OldModuleTest(nn.CMul, - (50, 4), - input_size=(1, 50, 4), - desc='3D_single_example'), + (50, 4), + input_size=(1, 50, 4), + desc='3D_single_example'), OldModuleTest(nn.CMul, - (1, 5), - input=torch.randn(10, 3, 5)[:,1], - desc='3D_noncontiguous'), + (1, 5), + input=torch.randn(10, 3, 5)[:, 1], + desc='3D_noncontiguous'), OldModuleTest(nn.Exp, - input_size=(2, 3, 4), - reference_fn=lambda i,_: i.exp()), + input_size=(2, 3, 4), + reference_fn=lambda i, _: i.exp()), OldModuleTest(nn.Log, - input=torch.rand(2, 3, 2) + 0.1, - reference_fn=lambda i,_: i.log()), + input=torch.rand(2, 3, 2) + 0.1, + reference_fn=lambda i, _: i.log()), OldModuleTest(nn.Clamp, - (-2., 5.), - input=torch.randn(3, 2, 50) * 6, - reference_fn=lambda i,_: i.clamp(-2, 5)), + (-2., 5.), + input=torch.randn(3, 2, 50) * 6, + reference_fn=lambda i, _: i.clamp(-2, 5)), OldModuleTest(nn.Abs, - input_size=(3, 20, 5), - reference_fn=lambda i,_: i.abs()), + input_size=(3, 20, 5), + reference_fn=lambda i, _: i.abs()), OldModuleTest(nn.Bilinear, - (2, 3, 10), - input_size=[(4, 2), (4, 3)]), + (2, 3, 10), + input_size=[(4, 2), (4, 3)]), OldModuleTest(nn.Bilinear, - (5, 4, 2), - input_size=[(2, 5), (2, 4)], - desc='small_output'), + (5, 4, 2), + input_size=[(2, 5), (2, 4)], + desc='small_output'), OldModuleTest(nn.Euclidean, - (5, 7), - input_size=(10, 5)), + (5, 7), + input_size=(10, 5)), OldModuleTest(nn.WeightedEuclidean, - (5, 7), - input_size=(10, 5)), + (5, 7), + input_size=(10, 5)), OldModuleTest(nn.Cosine, - (5, 7), - input_size=(10, 5)), + (5, 7), + input_size=(10, 5)), OldModuleTest(nn.CAddTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)]), OldModuleTest(nn.CSubTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)]), OldModuleTest(nn.CDivTable, - input=[torch.randn(1, 7), torch.rand(1, 7) + 0.1]), + input=[torch.randn(1, 7), torch.rand(1, 7) + 0.1]), OldModuleTest(nn.CMulTable, - input_size=[(5, 7), (5, 7)]), + input_size=[(5, 7), (5, 7)]), OldModuleTest(nn.Square, - input_size=(10, 2, 4), - reference_fn=lambda i,_: i.mul(i)), + input_size=(10, 2, 4), + reference_fn=lambda i, _: i.mul(i)), OldModuleTest(nn.Sqrt, - input=torch.rand(10, 2, 4)+0.01, - reference_fn=lambda i,_: i.sqrt()), + input=torch.rand(10, 2, 4) + 0.01, + reference_fn=lambda i, _: i.sqrt()), OldModuleTest(nn.Squeeze, - input_size=(2, 1, 1, 4, 5), - reference_fn=lambda i,_: i.squeeze()), + input_size=(2, 1, 1, 4, 5), + reference_fn=lambda i, _: i.squeeze()), OldModuleTest(nn.Squeeze, - (1,), - input_size=(2, 1, 1, 4, 5), - reference_fn=lambda i,_: i.squeeze(1), - desc='dim'), + (1,), + input_size=(2, 1, 1, 4, 5), + reference_fn=lambda i, _: i.squeeze(1), + 
desc='dim'), OldModuleTest(nn.Unsqueeze, - (1,), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.view(2, 1, 4, 5)), + (1,), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.view(2, 1, 4, 5)), OldModuleTest(nn.Unsqueeze, - (0,), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.view(1, 2, 4, 5), - desc='fist_dim'), + (0,), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.view(1, 2, 4, 5), + desc='fist_dim'), OldModuleTest(nn.Unsqueeze, - (3,), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.view(2, 4, 5, 1), - desc='last_dim'), + (3,), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.view(2, 4, 5, 1), + desc='last_dim'), OldModuleTest(nn.View, - (-1, 2, 20), - input_size=(2, 2, 4, 5), - reference_fn=lambda i,_: i.view(-1, 2, 20), - desc='infer_batch'), + (-1, 2, 20), + input_size=(2, 2, 4, 5), + reference_fn=lambda i, _: i.view(-1, 2, 20), + desc='infer_batch'), OldModuleTest(nn.View, - (2, 2, 2, 5), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.view(2, 2, 2, 5), - desc='split_dim'), + (2, 2, 2, 5), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.view(2, 2, 2, 5), + desc='split_dim'), OldModuleTest(nn.View, - (2, -1, 2, 5), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.view(2, -1, 2, 5), - desc='infer_middle'), + (2, -1, 2, 5), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.view(2, -1, 2, 5), + desc='infer_middle'), OldModuleTest(nn.Sum, - (1,), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.sum(1).squeeze(1)), + (1,), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.sum(1).squeeze(1)), OldModuleTest(nn.Sum, - (1, True), - input_size=(2, 4, 5), - reference_fn=lambda i,_: i.sum(1).div(i.size(1)).squeeze(1), - desc='sizeAverage'), + (1, True), + input_size=(2, 4, 5), + reference_fn=lambda i, _: i.sum(1).div(i.size(1)).squeeze(1), + desc='sizeAverage'), OldModuleTest(nn.Mean, - (1,), - input_size=(2, 4, 5), - reference_fn=lambda i,_: torch.mean(i, 1).squeeze(1)), + (1,), + input_size=(2, 4, 5), + reference_fn=lambda i, _: torch.mean(i, 1).squeeze(1)), OldModuleTest(lambda: nn.Sequential().add(nn.GradientReversal()).add(nn.GradientReversal()), - input_size=(4, 3, 2, 2), - fullname='GradientReversal'), + input_size=(4, 3, 2, 2), + fullname='GradientReversal'), OldModuleTest(nn.Identity, - input_size=(4, 3, 2, 4), - reference_fn=lambda i,_: i), + input_size=(4, 3, 2, 4), + reference_fn=lambda i, _: i), OldModuleTest(nn.DotProduct, - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i,_: torch.Tensor(list( - a.dot(b) for a, b in zip(i[0], i[1]))) - ), + input_size=[(10, 4), (10, 4)], + reference_fn=lambda i, _: torch.Tensor(list( + a.dot(b) for a, b in zip(i[0], i[1]))) + ), OldModuleTest(nn.CosineDistance, - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i,_: torch.Tensor(list( - a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))) - ), + input_size=[(10, 4), (10, 4)], + reference_fn=lambda i, _: torch.Tensor(list( + a.dot(b) / (a.norm(2) * b.norm(2)) for a, b in zip(i[0], i[1]))) + ), OldModuleTest(nn.JoinTable, - (0,), - input_size=[(10, 4), (10, 4)], - reference_fn=lambda i,_: torch.cat(i, 0), - desc='first_dim'), + (0,), + input_size=[(10, 4), (10, 4)], + reference_fn=lambda i, _: torch.cat(i, 0), + desc='first_dim'), OldModuleTest(nn.JoinTable, - (2,), - input_size=[(2, 4, 2), (2, 4, 2)], - reference_fn=lambda i,_: torch.cat(i, 2), - desc='positive_dim_index'), + (2,), + input_size=[(2, 4, 2), (2, 4, 2)], + reference_fn=lambda i, _: torch.cat(i, 2), + desc='positive_dim_index'), OldModuleTest(nn.JoinTable, 
- (-1,), - input_size=[(2, 4, 2, 4), (2, 4, 2, 4)], - reference_fn=lambda i,_: torch.cat(i, 3), - desc='negative_dim_index'), + (-1,), + input_size=[(2, 4, 2, 4), (2, 4, 2, 4)], + reference_fn=lambda i, _: torch.cat(i, 3), + desc='negative_dim_index'), OldModuleTest(nn.MM, - input_size=[(4, 5, 3), (4, 3, 2)], - reference_fn=lambda i,_: torch.bmm(*i)), + input_size=[(4, 5, 3), (4, 3, 2)], + reference_fn=lambda i, _: torch.bmm(*i)), OldModuleTest(nn.MV, - input_size=[(4, 5, 3), (4, 3)], - reference_fn=lambda i,_: torch.bmm(i[0], i[1].view(i[1].size(0), i[1].size(1), 1)).squeeze()), + input_size=[(4, 5, 3), (4, 3)], + reference_fn=lambda i, _: torch.bmm(i[0], i[1].view(i[1].size(0), i[1].size(1), 1)).squeeze()), OldModuleTest(nn.Max, - input_size=(4, 5, 3), - reference_fn=lambda i,_: torch.max(i, 0)[0].squeeze()), + input_size=(4, 5, 3), + reference_fn=lambda i, _: torch.max(i, 0)[0].squeeze()), OldModuleTest(nn.Max, - (1,), - input_size=(4, 5, 3), - reference_fn=lambda i,_: torch.max(i, 1)[0].squeeze(), - desc='with_dimension'), + (1,), + input_size=(4, 5, 3), + reference_fn=lambda i, _: torch.max(i, 1)[0].squeeze(), + desc='with_dimension'), OldModuleTest(nn.Min, - input_size=(4, 5, 3), - reference_fn=lambda i,_: torch.min(i, 0)[0].squeeze()), + input_size=(4, 5, 3), + reference_fn=lambda i, _: torch.min(i, 0)[0].squeeze()), OldModuleTest(nn.Min, - (1,), - input_size=(4, 5, 3), - reference_fn=lambda i,_: torch.min(i, 1)[0].squeeze(), - desc='with_dimension'), + (1,), + input_size=(4, 5, 3), + reference_fn=lambda i, _: torch.min(i, 1)[0].squeeze(), + desc='with_dimension'), OldModuleTest(nn.MixtureTable, - tuple(), - input_size=[(5, 3), (5, 3, 6)]), + tuple(), + input_size=[(5, 3), (5, 3, 6)]), OldModuleTest(nn.LookupTable, - (4, 3), - input=torch.randperm(2).repeat(1, 2), - jacobian_input=False), + (4, 3), + input=torch.randperm(2).repeat(1, 2), + jacobian_input=False), OldModuleTest(nn.Mul, - input_size=(2, 3, 4, 2), - reference_fn=lambda i,p: i * p[0][0]), + input_size=(2, 3, 4, 2), + reference_fn=lambda i, p: i * p[0][0]), OldModuleTest(nn.MulConstant, - (4,), - input_size=(2, 3, 4, 2), - reference_fn=lambda i,_: i * 4, - check_inplace=True), + (4,), + input_size=(2, 3, 4, 2), + reference_fn=lambda i, _: i * 4, + check_inplace=True), OldModuleTest(nn.Narrow, - (0, 0), - input_size=(2, 3, 4, 2), - reference_fn=lambda i,_: i.narrow(0, 0, 1)), + (0, 0), + input_size=(2, 3, 4, 2), + reference_fn=lambda i, _: i.narrow(0, 0, 1)), OldModuleTest(nn.Narrow, - (1, 1, 2), - input_size=(2, 3, 4, 2), - reference_fn=lambda i,_: i.narrow(1, 1, 2), - desc='length'), + (1, 1, 2), + input_size=(2, 3, 4, 2), + reference_fn=lambda i, _: i.narrow(1, 1, 2), + desc='length'), OldModuleTest(nn.Transpose, - ((1, 2), (1, 3)), - input_size=(2, 3, 4, 5), - reference_fn=lambda i,_: i.transpose(1, 2).transpose(1, 3)), + ((1, 2), (1, 3)), + input_size=(2, 3, 4, 5), + reference_fn=lambda i, _: i.transpose(1, 2).transpose(1, 3)), OldModuleTest(nn.Transpose, - ((1, 2),), - input_size=(2, 3, 4, 5), - reference_fn=lambda i,_: i.transpose(1, 2), - desc='single_arg'), + ((1, 2),), + input_size=(2, 3, 4, 5), + reference_fn=lambda i, _: i.transpose(1, 2), + desc='single_arg'), # TODO: this seems to be very slow OldModuleTest(nn.Replicate, - (2, 1), - input_size=(10, 3, 4, 5), - reference_fn=lambda i,_: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5)), + (2, 1), + input_size=(10, 3, 4, 5), + reference_fn=lambda i, _: i.view(10, 1, 3, 4, 5).expand(10, 2, 3, 4, 5)), OldModuleTest(nn.Padding, - (0, 2, -10), - input_size=(2, 3, 4, 
5)), + (0, 2, -10), + input_size=(2, 3, 4, 5)), OldModuleTest(nn.Padding, - (0, 2, -10, 1), - input_size=(2, 3, 4, 5), - desc='index'), + (0, 2, -10, 1), + input_size=(2, 3, 4, 5), + desc='index'), OldModuleTest(nn.Padding, - (0, -2, -10, 1), - input_size=(2, 3, 4, 5), - desc='negative_pad'), + (0, -2, -10, 1), + input_size=(2, 3, 4, 5), + desc='negative_pad'), OldModuleTest(nn.PartialLinear, - (5, 6), - input_size=(4, 5)), + (5, 6), + input_size=(4, 5)), OldModuleTest(lambda: nn.PartialLinear(5, 6).setPartition(torch.Tensor((2, 4))), - input_size=(4, 5), - fullname='PartialLinear_setPartition'), + input_size=(4, 5), + fullname='PartialLinear_setPartition'), OldModuleTest(nn.Power, - (2,), - input_size=(2, 3, 4, 5)), + (2,), + input_size=(2, 3, 4, 5)), OldModuleTest(nn.Power, - (1.5,), - input=torch.rand(3, 4, 5), - desc='fractional'), + (1.5,), + input=torch.rand(3, 4, 5), + desc='fractional'), OldModuleTest(nn.Reshape, - (4, 5), - input_size=(3, 4*5), - desc='add_dim'), + (4, 5), + input_size=(3, 4 * 5), + desc='add_dim'), OldModuleTest(nn.Reshape, - (4*5,), - input_size=(3, 4, 5), - desc='squash_dim'), + (4 * 5,), + input_size=(3, 4, 5), + desc='squash_dim'), OldModuleTest(nn.Select, - (1, 2), - input_size=(3, 4, 5), - reference_fn=lambda i,_: i.select(1, 2)), + (1, 2), + input_size=(3, 4, 5), + reference_fn=lambda i, _: i.select(1, 2)), OldModuleTest(nn.SelectTable, - (1,), - input_size=[(1,), (2,), (3,), (4,)], - reference_fn=lambda i,_: i[1]), + (1,), + input_size=[(1,), (2,), (3,), (4,)], + reference_fn=lambda i, _: i[1]), OldModuleTest(nn.SpatialAveragePooling, - (2, 2), - input_size=(2, 3, 6, 6)), + (2, 2), + input_size=(2, 3, 6, 6)), OldModuleTest(nn.SpatialAveragePooling, - (2, 2, 2, 2), - input_size=(2, 3, 6, 6), - desc='stride'), + (2, 2, 2, 2), + input_size=(2, 3, 6, 6), + desc='stride'), OldModuleTest(nn.SpatialAveragePooling, - (2, 2, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='stride_pad'), + (2, 2, 2, 2, 1, 1), + input_size=(2, 3, 6, 6), + desc='stride_pad'), OldModuleTest(nn.SpatialAdaptiveMaxPooling, - (4, 4), - input_size=(2, 3, 8, 8), - reference_fn=lambda i,_: nn.SpatialMaxPooling(2, 2).forward(i)), + (4, 4), + input_size=(2, 3, 8, 8), + reference_fn=lambda i, _: nn.SpatialMaxPooling(2, 2).forward(i)), OldModuleTest(nn.SpatialAdaptiveMaxPooling, - (4, 4), - input_size=(2, 3, 7, 11), - desc='irregular'), + (4, 4), + input_size=(2, 3, 7, 11), + desc='irregular'), OldModuleTest(nn.SpatialConvolution, - (3, 4, 3, 3), - input_size=(2, 3, 6, 6)), + (3, 4, 3, 3), + input_size=(2, 3, 6, 6)), OldModuleTest(nn.SpatialConvolution, - (3, 4, 3, 3, 2, 2), - input_size=(2, 3, 6, 6), - desc='strided'), + (3, 4, 3, 3, 2, 2), + input_size=(2, 3, 6, 6), + desc='strided'), OldModuleTest(nn.SpatialConvolution, - (3, 4, 3, 3, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='padding'), + (3, 4, 3, 3, 2, 2, 1, 1), + input_size=(2, 3, 6, 6), + desc='padding'), OldModuleTest(nn.SpatialConvolutionLocal, - (3, 2, 4, 4, 2, 2), - input_size=(1, 3, 4, 4)), + (3, 2, 4, 4, 2, 2), + input_size=(1, 3, 4, 4)), OldModuleTest(nn.SpatialConvolutionLocal, - (3, 2, 6, 6, 2, 2, 2, 2), - input_size=(2, 3, 6, 6), - desc='stride'), + (3, 2, 6, 6, 2, 2, 2, 2), + input_size=(2, 3, 6, 6), + desc='stride'), OldModuleTest(nn.SpatialConvolutionLocal, - (3, 2, 6, 6, 2, 2, 2, 2, 1, 1), - input_size=(2, 3, 6, 6), - desc='stride_pad'), + (3, 2, 6, 6, 2, 2, 2, 2, 1, 1), + input_size=(2, 3, 6, 6), + desc='stride_pad'), OldModuleTest(nn.SpatialDivisiveNormalization, - (3,), - input_size=(2, 3, 8, 8)), + (3,), + 
input_size=(2, 3, 8, 8)), OldModuleTest(nn.SpatialContrastiveNormalization, - (3,), - input_size=(2, 3, 8, 8)), + (3,), + input_size=(2, 3, 8, 8)), OldModuleTest(nn.SpatialDilatedConvolution, - (3, 2, 3, 3, 2, 2, 1, 1, 2, 2), - input_size=(2, 3, 8, 8)), + (3, 2, 3, 3, 2, 2, 1, 1, 2, 2), + input_size=(2, 3, 8, 8)), OldModuleTest(nn.SpatialDilatedConvolution, - (3, 2, 3, 3, 2, 2, 1, 1, 2, 2), - input_size=(2, 3, 8, 8), - desc='stride_pad'), + (3, 2, 3, 3, 2, 2, 1, 1, 2, 2), + input_size=(2, 3, 8, 8), + desc='stride_pad'), OldModuleTest(nn.SpatialMaxPooling, - (3, 3, 2, 2, 1, 1), - input_size=(1, 3, 7, 7)), + (3, 3, 2, 2, 1, 1), + input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialReflectionPadding, - (1, 2, 3, 4), - input_size=(2, 3, 8, 8)), + (1, 2, 3, 4), + input_size=(2, 3, 8, 8)), OldModuleTest(nn.SpatialReplicationPadding, - (1, 2, 3, 4), - input_size=(2, 3, 4, 4)), + (1, 2, 3, 4), + input_size=(2, 3, 4, 4)), OldModuleTest(nn.SpatialZeroPadding, - (1, 2, 3, 4), - input_size=(2, 3, 4, 4)), + (1, 2, 3, 4), + input_size=(2, 3, 4, 4)), OldModuleTest(nn.SpatialConvolutionMap, - (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), - input_size=(3, 5, 5), - desc='oneToOne'), + (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), + input_size=(3, 5, 5), + desc='oneToOne'), OldModuleTest(nn.SpatialConvolutionMap, - (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), - input_size=(3, 5, 5), - desc='oneToOne_stride'), + (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), + input_size=(3, 5, 5), + desc='oneToOne_stride'), OldModuleTest(nn.SpatialConvolutionMap, - (nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), - input_size=(3, 5, 5), - desc='full'), + (nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), + input_size=(3, 5, 5), + desc='full'), OldModuleTest(nn.SpatialFullConvolutionMap, - (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), - input_size=(3, 5, 5), - desc='oneToOne'), + (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3), + input_size=(3, 5, 5), + desc='oneToOne'), OldModuleTest(nn.SpatialFullConvolutionMap, - (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), - input_size=(3, 5, 5), - desc='oneToOne_stride'), + (nn.SpatialConvolutionMap.maps.oneToOne(3), 3, 3, 2, 2), + input_size=(3, 5, 5), + desc='oneToOne_stride'), OldModuleTest(nn.SpatialFullConvolutionMap, - (nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), - input_size=(3, 5, 5), - desc='full'), + (nn.SpatialConvolutionMap.maps.full(3, 4), 3, 3), + input_size=(3, 5, 5), + desc='full'), # TODO: test CUDA OldModuleTest(lambda: nn.SpatialFractionalMaxPooling(2, 2, 0.5, 0.5).fixPoolingRegions(), - input_size=(1, 3, 5, 5), - fullname='SpatialFractionalMaxPooling_ratio', - test_cuda=False), + input_size=(1, 3, 5, 5), + fullname='SpatialFractionalMaxPooling_ratio', + test_cuda=False), OldModuleTest(lambda: nn.SpatialFractionalMaxPooling(2, 2, 4, 4).fixPoolingRegions(), - input_size=(1, 3, 7, 7), - fullname='SpatialFractionalMaxPooling_size', - test_cuda=False), + input_size=(1, 3, 7, 7), + fullname='SpatialFractionalMaxPooling_size', + test_cuda=False), OldModuleTest(nn.SpatialFullConvolution, - (3, 4, 3, 3, 2, 2, 1, 1, 1, 1), - input_size=(1, 3, 7, 7)), + (3, 4, 3, 3, 2, 2, 1, 1, 1, 1), + input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialLPPooling, - (3, 2, 2, 2, 2, 2), - input_size=(1, 3, 7, 7)), + (3, 2, 2, 2, 2, 2), + input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialSubSampling, - (3, 3, 3, 2, 2), - input_size=(1, 3, 7, 7)), + (3, 3, 3, 2, 2), + input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialSubtractiveNormalization, - 
(3,), - input_size=(1, 3, 7, 7)), + (3,), + input_size=(1, 3, 7, 7)), OldModuleTest(nn.SpatialSubtractiveNormalization, - (3, torch.rand(3)), - input_size=(1, 3, 7, 7), - desc='kernel'), + (3, torch.rand(3)), + input_size=(1, 3, 7, 7), + desc='kernel'), OldModuleTest(nn.SpatialUpSamplingNearest, - (2,), - input_size=(1, 3, 4, 4)), + (2,), + input_size=(1, 3, 4, 4)), OldModuleTest(nn.TemporalConvolution, - (4, 5, 3), - input_size=(2, 10, 4)), + (4, 5, 3), + input_size=(2, 10, 4)), OldModuleTest(nn.TemporalConvolution, - (4, 5, 3, 2), - input_size=(2, 10, 4), - desc='stride'), + (4, 5, 3, 2), + input_size=(2, 10, 4), + desc='stride'), # TODO: this runs in non-batch mode only OldModuleTest(nn.TemporalSubSampling, - (4, 3), - input_size=(10, 4)), + (4, 3), + input_size=(10, 4)), OldModuleTest(nn.TemporalSubSampling, - (4, 3, 2), - input_size=(10, 4), - desc='stride'), + (4, 3, 2), + input_size=(10, 4), + desc='stride'), OldModuleTest(nn.VolumetricAveragePooling, - (2, 2, 2), - input_size=(2, 3, 4, 4, 4)), + (2, 2, 2), + input_size=(2, 3, 4, 4, 4)), OldModuleTest(nn.VolumetricAveragePooling, - (2, 2, 2, 2, 2, 2), - input_size=(2, 3, 5, 5, 5), - desc='stride'), + (2, 2, 2, 2, 2, 2), + input_size=(2, 3, 5, 5, 5), + desc='stride'), OldModuleTest(nn.VolumetricConvolution, - (3, 4, 2, 2, 2), - input_size=(2, 3, 3, 3, 3)), + (3, 4, 2, 2, 2), + input_size=(2, 3, 3, 3, 3)), OldModuleTest(nn.VolumetricConvolution, - (3, 4, 2, 2, 2, 2, 2, 2), - input_size=(2, 3, 5, 5, 5), - desc='stride'), + (3, 4, 2, 2, 2, 2, 2, 2), + input_size=(2, 3, 5, 5, 5), + desc='stride'), OldModuleTest(nn.VolumetricConvolution, - (3, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1), - input_size=(2, 3, 5, 5, 5), - desc='stride_padding'), + (3, 4, 2, 2, 2, 2, 2, 2, 1, 1, 1), + input_size=(2, 3, 5, 5, 5), + desc='stride_padding'), OldModuleTest(nn.VolumetricFullConvolution, - (2, 3, 2, 2, 2), - input_size=(1, 2, 4, 4, 4)), + (2, 3, 2, 2, 2), + input_size=(1, 2, 4, 4, 4)), OldModuleTest(nn.VolumetricMaxPooling, - (2, 2, 2), - input_size=(2, 3, 5, 5, 5)), + (2, 2, 2), + input_size=(2, 3, 5, 5, 5)), OldModuleTest(nn.VolumetricMaxPooling, - (2, 2, 2, 2, 2, 2), - input_size=(2, 3, 5, 5, 5), - desc='stride'), + (2, 2, 2, 2, 2, 2), + input_size=(2, 3, 5, 5, 5), + desc='stride'), OldModuleTest(nn.VolumetricMaxPooling, - (2, 2, 2, 2, 2, 2, 1, 1, 1), - input_size=(2, 3, 5, 5, 5), - desc='stride_padding'), + (2, 2, 2, 2, 2, 2, 1, 1, 1), + input_size=(2, 3, 5, 5, 5), + desc='stride_padding'), OldModuleTest(nn.VolumetricReplicationPadding, - (1, 2, 3, 4, 5, 6), - input_size=(2, 3, 5, 5, 5)), + (1, 2, 3, 4, 5, 6), + input_size=(2, 3, 5, 5, 5)), CriterionTest(nn.L1Cost, - input=torch.randn(2, 3, 4, 5), - target=None), + input=torch.randn(2, 3, 4, 5), + target=None), CriterionTest(nn.L1HingeEmbeddingCriterion, - input=[torch.randn(2, 3, 4, 5), torch.randn(2, 3, 4, 5)], - target=1), + input=[torch.randn(2, 3, 4, 5), torch.randn(2, 3, 4, 5)], + target=1), CriterionTest(nn.L1HingeEmbeddingCriterion, - (2,), - input=[torch.randn(2, 3, 4, 5), torch.randn(2, 3, 4, 5)], - target=1, - desc='margin'), + (2,), + input=[torch.randn(2, 3, 4, 5), torch.randn(2, 3, 4, 5)], + target=1, + desc='margin'), CriterionTest(nn.WeightedMSECriterion, - (torch.rand(3, 4, 5),), - input=torch.randn(2, 3, 4, 5), - target=torch.randn(2, 3, 4, 5)), + (torch.rand(3, 4, 5),), + input=torch.randn(2, 3, 4, 5), + target=torch.randn(2, 3, 4, 5)), CriterionTest(nn.MarginCriterion, - input_size=(5, 10), - target=torch.randn(5, 10).sign()), + input_size=(5, 10), + target=torch.randn(5, 10).sign()), 
CriterionTest(nn.ClassSimplexCriterion, - (30,), - input=torch.randn(5, 30).mul(10).renorm(2, 0, 1), - target=torch.rand(5).mul(30).floor().long(), - desc='margin'), + (30,), + input=torch.randn(5, 30).mul(10).renorm(2, 0, 1), + target=torch.rand(5).mul(30).floor().long(), + desc='margin'), ] # TODO: FlattenTable gradient # TODO: NarrowTable gradient @@ -527,30 +529,32 @@ tests = [ for p in (1, 2, 1.5): tests.append( OldModuleTest(nn.Normalize, - (p,), - input_size=(4, 5), - # Eh, we need to use p as a default, so it's passed by value - reference_fn=lambda i,_,p=p: i.div(i.norm(p, 1).expand_as(i)), - desc=str(p)), + (p,), + input_size=(4, 5), + # Eh, we need to use p as a default, so it's passed by value + reference_fn=lambda i, _, p=p: i.div(i.norm(p, 1).expand_as(i)), + desc=str(p)), ) -for p in range(1, 4+1): +for p in range(1, 4 + 1): tests.append( OldModuleTest(nn.PairwiseDistance, - (p,), - input_size=[(4, 10), (4, 10)], - desc=str(p)) + (p,), + input_size=[(4, 10), (4, 10)], + desc=str(p)) ) + def build_spatial_unpooling_net(): pool = nn.SpatialMaxPooling(2, 2, 2, 2) unpool = nn.SpatialMaxUnpooling(pool) return nn.Sequential().add(pool).add(unpool) tests.append( - OldModuleTest(build_spatial_unpooling_net, - input_size=(1, 3, 10, 10), - desc='SpatialMaxUnpooling') - ) + OldModuleTest(build_spatial_unpooling_net, + input_size=(1, 3, 10, 10), + desc='SpatialMaxUnpooling') +) + def build_volumetric_unpooling_net(): pool = nn.VolumetricMaxPooling(2, 2, 2, 2) @@ -558,10 +562,11 @@ def build_volumetric_unpooling_net(): return nn.Sequential().add(pool).add(unpool) tests.append( - OldModuleTest(build_volumetric_unpooling_net, - input_size=(1, 3, 10, 10), - desc='VolumetricMaxUnpooling') - ) + OldModuleTest(build_volumetric_unpooling_net, + input_size=(1, 3, 10, 10), + desc='VolumetricMaxUnpooling') +) + def prepare_tests(): def add_test(test): @@ -571,8 +576,8 @@ def prepare_tests(): raise RuntimeError('Found two tests with the same name: ' + test_name) if hasattr(TestNN, cuda_test_name): raise RuntimeError('Found two tests with the same name: ' + cuda_test_name) - setattr(TestNN, test_name, lambda self,test=test: test(self)) - setattr(TestNN, cuda_test_name, lambda self,test=test: test.test_cuda(self)) + setattr(TestNN, test_name, lambda self, test=test: test(self)) + setattr(TestNN, cuda_test_name, lambda self, test=test: test.test_cuda(self)) name_remap = { 'Conv2d': 'SpatialConvolution', 'MaxPool2d': 'SpatialMaxPooling', @@ -613,6 +618,7 @@ def prepare_tests(): test = CriterionTest(**test_params) add_test(test) + class TestNN(NNTestCase): def _forward(self, module, input): @@ -636,19 +642,19 @@ class TestNN(NNTestCase): def test_Dropout(self): p = 0.2 - input = torch.Tensor(1000).fill_(1-p) + input = torch.Tensor(1000).fill_(1 - p) module = nn.Dropout(p) output = module.forward(input) - self.assertLess(abs(output.mean() - (1-p)), 0.05) + self.assertLess(abs(output.mean() - (1 - p)), 0.05) gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1-p)), 0.05) + self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) module = nn.Dropout(p, True) output = module.forward(input.clone()) - self.assertLess(abs(output.mean() - (1-p)), 0.05) + self.assertLess(abs(output.mean() - (1 - p)), 0.05) gradInput = module.backward(input.clone(), input.clone()) - self.assertLess(abs(gradInput.mean() - (1-p)), 0.05) + self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) # Check that these don't raise errors module.__repr__() @@ -664,9 +670,9 @@ class TestNN(NNTestCase): module = 
nn.SpatialDropout(p) module.training() output = module.forward(input) - self.assertLess(abs(output.mean() - (1-p)), 0.05) + self.assertLess(abs(output.mean() - (1 - p)), 0.05) gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1-p)), 0.05) + self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) # Check that these don't raise errors module.__repr__() @@ -674,18 +680,18 @@ class TestNN(NNTestCase): def test_VolumetricDropout(self): p = 0.2 - bsz = random.randint(1,5) - t = random.randint(1,5) - w = random.randint(1,5) - h = random.randint(1,5) + bsz = random.randint(1, 5) + t = random.randint(1, 5) + w = random.randint(1, 5) + h = random.randint(1, 5) nfeats = 1000 input = torch.Tensor(bsz, nfeats, t, w, h).fill_(1) module = nn.VolumetricDropout(p) module.training() output = module.forward(input) - self.assertLess(abs(output.mean() - (1-p)), 0.05) + self.assertLess(abs(output.mean() - (1 - p)), 0.05) gradInput = module.backward(input, input) - self.assertLess(abs(gradInput.mean() - (1-p)), 0.05) + self.assertLess(abs(gradInput.mean() - (1 - p)), 0.05) # Check that these don't raise errors module.__repr__() @@ -706,7 +712,7 @@ class TestNN(NNTestCase): self.assertTrue(output[input.lt(0)].eq(0).all()) def test_Copy(self): - input = torch.randn(3,4).double() + input = torch.randn(3, 4).double() c = nn.Copy(torch.DoubleTensor, torch.FloatTensor) output = c.forward(input) self.assertEqual(torch.typename(output), 'torch.FloatTensor') @@ -833,9 +839,9 @@ class TestNN(NNTestCase): def test_ParallelTable(self): input = torch.randn(3, 4, 5) p = nn.ParallelTable() - p.add(nn.View(4,5,1)) - p.add(nn.View(4,5,1)) - p.add(nn.View(4,5,1)) + p.add(nn.View(4, 5, 1)) + p.add(nn.View(4, 5, 1)) + p.add(nn.View(4, 5, 1)) m = nn.Sequential() m.add(nn.SplitTable(0)) m.add(p) @@ -846,7 +852,7 @@ class TestNN(NNTestCase): str(p) output = m.forward(input) - output2 = input.transpose(0,2).transpose(0,1) + output2 = input.transpose(0, 2).transpose(0, 1) self.assertEqual(output2, output) gradInput = m.backward(input, output2) @@ -854,15 +860,15 @@ class TestNN(NNTestCase): def test_ConcatTable(self): input = [ - torch.randn(3, 4).float(), torch.randn(3, 4).float(), [torch.randn(3, 4).float()] + torch.randn(3, 4).float(), torch.randn(3, 4).float(), [torch.randn(3, 4).float()] ] _gradOutput = [ - torch.randn(3, 3,4).float(), torch.randn(3, 3,4).float(), torch.randn(3, 3,4).float() + torch.randn(3, 3, 4).float(), torch.randn(3, 3, 4).float(), torch.randn(3, 3, 4).float() ] gradOutput = [ - [_gradOutput[0][0], _gradOutput[1][0], [_gradOutput[2][0]]], - [_gradOutput[0][1], _gradOutput[1][1], [_gradOutput[2][1]]], - [_gradOutput[0][2], _gradOutput[1][2], [_gradOutput[2][2]]] + [_gradOutput[0][0], _gradOutput[1][0], [_gradOutput[2][0]]], + [_gradOutput[0][1], _gradOutput[1][1], [_gradOutput[2][1]]], + [_gradOutput[0][2], _gradOutput[1][2], [_gradOutput[2][2]]] ] module = nn.ConcatTable() module.add(nn.Identity()) @@ -878,7 +884,8 @@ class TestNN(NNTestCase): output2 = [input, input, input] self.assertEqual(output2, output) gradInput = module.backward(input, gradOutput) - gradInput2 = [_gradOutput[0].sum(0).squeeze(0), _gradOutput[1].sum(0).squeeze(0), [_gradOutput[2].sum(0).squeeze(0)]] + gradInput2 = [_gradOutput[0].sum(0).squeeze(0), _gradOutput[1].sum( + 0).squeeze(0), [_gradOutput[2].sum(0).squeeze(0)]] self.assertTrue(isinstance(gradInput, list)) self.assertFalse(isinstance(gradInput[0], list)) self.assertFalse(isinstance(gradInput[1], list)) @@ -910,25 +917,26 @@ class 
TestNN(NNTestCase): input = torch.randn(2, 3, 12, 12) gradOutput = torch.randn(2, int(outputSize.sum()), 12, 12) concat = nn.DepthConcat(1) - concat.add(nn.SpatialConvolution(3, outputSize[0], 1, 1, 1, 1)) #> 2, 5, 12, 12 - concat.add(nn.SpatialConvolution(3, outputSize[1], 3, 3, 1, 1)) #> 2, 6, 10, 10 - concat.add(nn.SpatialConvolution(3, outputSize[2], 4, 4, 1, 1)) #> 2, 7, 9, 9 - concat.add(nn.SpatialConvolution(3, outputSize[3], 5, 5, 1, 1)) #> 2, 8, 8, 8 + concat.add(nn.SpatialConvolution(3, outputSize[0], 1, 1, 1, 1)) # > 2, 5, 12, 12 + concat.add(nn.SpatialConvolution(3, outputSize[1], 3, 3, 1, 1)) # > 2, 6, 10, 10 + concat.add(nn.SpatialConvolution(3, outputSize[2], 4, 4, 1, 1)) # > 2, 7, 9, 9 + concat.add(nn.SpatialConvolution(3, outputSize[3], 5, 5, 1, 1)) # > 2, 8, 8, 8 concat.zeroGradParameters() # forward/backward outputConcat = concat.forward(input) gradInputConcat = concat.backward(input, gradOutput) # the spatial dims are the largest, the nFilters is the sum - output = torch.Tensor(2, int(outputSize.sum()), 12, 12).zero_() # zero for padding - narrows = ( (slice(None), slice(0, 5), slice(None), slice(None)), (slice(None), slice(5, 11), slice(1, 11), slice(1, 11)), (slice(None), slice(11, 18), slice(1, 10), slice(1, 10)), (slice(None), slice(18, 26), slice(2, 10), slice(2, 10)) ) + output = torch.Tensor(2, int(outputSize.sum()), 12, 12).zero_() # zero for padding + narrows = ((slice(None), slice(0, 5), slice(None), slice(None)), (slice(None), slice(5, 11), slice(1, 11), slice( + 1, 11)), (slice(None), slice(11, 18), slice(1, 10), slice(1, 10)), (slice(None), slice(18, 26), slice(2, 10), slice(2, 10))) gradInput = input.clone().zero_() for i in range(4): - conv = concat.get(i) - gradWeight = conv.gradWeight.clone() - conv.zeroGradParameters() - output[narrows[i]].copy_(conv.forward(input)) - gradInput.add_(conv.backward(input, gradOutput[narrows[i]])) - self.assertEqual(gradWeight, conv.gradWeight) + conv = concat.get(i) + gradWeight = conv.gradWeight.clone() + conv.zeroGradParameters() + output[narrows[i]].copy_(conv.forward(input)) + gradInput.add_(conv.backward(input, gradOutput[narrows[i]])) + self.assertEqual(gradWeight, conv.gradWeight) self.assertEqual(output, outputConcat) self.assertEqual(gradInput, gradInputConcat) @@ -979,7 +987,7 @@ class TestNN(NNTestCase): weight = 1 m = nn.L1Penalty(weight, False, False) - input = torch.rand(2,10).add_(-0.5) + input = torch.rand(2, 10).add_(-0.5) input[0][0] = 0 m.forward(input) @@ -988,7 +996,7 @@ class TestNN(NNTestCase): self.assertEqual(input.abs().sum() * weight, m.loss) true_grad = (input.gt(0).type_as(grad) + - input.lt(0).type_as(grad).mul_(-1)).mul_(weight) + input.lt(0).type_as(grad).mul_(-1)).mul_(weight) self.assertEqual(true_grad, grad) # Check that these don't raise errors @@ -1023,7 +1031,7 @@ class TestNN(NNTestCase): mc = nn.MultiCriterion().add(nll, 0.5).add(nll2) output = mc.forward(input, target) - output2 = nll.forward(input, target)/2 + nll2.forward(input, target) + output2 = nll.forward(input, target) / 2 + nll2.forward(input, target) self.assertEqual(output, output2) gradInput = mc.backward(input, target) @@ -1072,7 +1080,7 @@ class TestNN(NNTestCase): mse = nn.MSECriterion() pc = nn.ParallelCriterion().add(nll, 0.5).add(mse) output = pc.forward(input, target) - output2 = nll.forward(input[0], target[0])/2 + mse.forward(input[1], target[1]) + output2 = nll.forward(input[0], target[0]) / 2 + mse.forward(input[1], target[1]) self.assertEqual(output, output2) gradInput2 = [nll.backward(input[0], 
target[0]).clone().div(2), mse.backward(input[1], target[1])] gradInput = pc.backward(input, target) @@ -1096,7 +1104,7 @@ class TestNN(NNTestCase): mse = nn.MSECriterion() pc = nn.ParallelCriterion(True).add(mse, 0.5).add(nn.MSECriterion()) output = pc.forward(input, target) - output2 = mse.forward(input[0], target)/2 + mse.forward(input[1], target) + output2 = mse.forward(input[0], target) / 2 + mse.forward(input[1], target) self.assertEqual(output, output2) gradInput = pc.backward(input, target) gradInput2 = [mse.backward(input[0], target).clone().div(2), mse.backward(input[1], target)] @@ -1112,11 +1120,12 @@ class TestNN(NNTestCase): pc = nn.ParallelCriterion().add(nll, 0.5).add(mse) pc2 = nn.ParallelCriterion().add(nll2, 0.4).add(pc) output = pc2.forward(input, target) - output2 = nll2.forward(input[0], target[0])*0.4 + nll.forward(input[1][0], target[1][0])/2 + mse.forward(input[1][1], target[1][1]) + output2 = nll2.forward(input[0], target[0]) * 0.4 + nll.forward(input[1][0], + target[1][0]) / 2 + mse.forward(input[1][1], target[1][1]) self.assertEqual(output, output2) gradInput2 = [ - nll2.backward(input[0], target[0]).clone().mul(0.4), - [nll.backward(input[1][1], target[1][0]).clone().div(2), mse.backward(input[1][1], target[1][1])] + nll2.backward(input[0], target[0]).clone().mul(0.4), + [nll.backward(input[1][1], target[1][0]).clone().div(2), mse.backward(input[1][1], target[1][1])] ] gradInput = pc2.backward(input, target) self.assertEqual(gradInput[0], gradInput2[0]) @@ -1144,11 +1153,11 @@ class TestNN(NNTestCase): def _build_net(self): return (nn.Sequential() - .add(nn.Concat(0) - .add(nn.Linear(2, 5)) - .add(nn.Linear(2, 5))) - .add(nn.ReLU()) - .add(nn.Linear(10, 20))) + .add(nn.Concat(0) + .add(nn.Linear(2, 5)) + .add(nn.Linear(2, 5))) + .add(nn.ReLU()) + .add(nn.Linear(10, 20))) def test_parameters(self): net = self._build_net() @@ -1197,6 +1206,7 @@ class TestNN(NNTestCase): def test_apply(self): net = self._build_net() seen_modules = set() + def callback(module): self.assertNotIn(module, seen_modules) seen_modules.add(module) @@ -1206,6 +1216,7 @@ class TestNN(NNTestCase): def test_listModules(self): net = self._build_net() module_list = list() + def callback(module): module_list.append(module) net.apply(callback) @@ -1214,6 +1225,7 @@ class TestNN(NNTestCase): def test_replace(self): ref_net = self._build_net() net = self._build_net() + def callback(module): if isinstance(module, nn.ReLU): return nn.Tanh() diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index 3a0142a97a..1a6e0b3525 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -16,8 +16,8 @@ from common import TestCase, run_tests HAS_SHM_FILES = os.path.isdir('/dev/shm') TEST_CUDA_IPC = torch.cuda.is_available() and \ - sys.version_info[0] == 3 and \ - sys.platform != 'darwin' + sys.version_info[0] == 3 and \ + sys.platform != 'darwin' def simple_fill(queue, event): @@ -74,7 +74,7 @@ def autograd_sharing(queue, ready, master_modified): master_modified.wait() expected_var = torch.range(1, 25).view(5, 5) - expected_var[0,0] = 1000 + expected_var[0, 0] = 1000 is_ok = var.data.equal(expected_var) var.data[:] = torch.ones(5, 5) @@ -189,7 +189,7 @@ class TestMultiprocessing(TestCase): def _test_preserve_sharing(self, ctx=mp, repeat=1): def do_test(): x = torch.randn(5, 5) - data = [x.storage(), x.storage()[1:4], x, x[2], x[:,1]] + data = [x.storage(), x.storage()[1:4], x, x[2], x[:, 1]] q = ctx.Queue() q.put(data) new_data = q.get() @@ -268,6 +268,7 @@ class 
TestMultiprocessing(TestCase): def test_inherit_tensor(self): class SubProcess(mp.Process): + def __init__(self, tensor): super(SubProcess, self).__init__() self.tensor = tensor @@ -286,7 +287,6 @@ class TestMultiprocessing(TestCase): torch.cuda.FloatTensor([1]) # initialize CUDA outside of leak checker self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor) - @unittest.skipIf(not TEST_CUDA_IPC, 'CUDA IPC not available') def test_cuda_small_tensors(self): # Check multiple small tensors which will likely use the same @@ -359,7 +359,7 @@ class TestMultiprocessing(TestCase): queue.put(var) ready.wait() - var.data[0,0] = 1000 + var.data[0, 0] = 1000 if var.grad is not None: var.grad.data[:] = torch.ones(5, 5) * 4 master_modified.set() @@ -380,8 +380,8 @@ class TestMultiprocessing(TestCase): ] for requires_grad, volatile in configs: var = Variable(torch.range(1, 25).view(5, 5), - requires_grad=requires_grad, - volatile=volatile) + requires_grad=requires_grad, + volatile=volatile) self._test_autograd_sharing(var) def test_parameter_sharing(self): diff --git a/test/test_nn.py b/test/test_nn.py index 0e6db08fe3..e516d2170c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -16,8 +16,10 @@ from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PRECISION from common import freeze_rng_state, run_tests + def default_tensor_type(type): type_str = torch.typename(type) + def decorator(fn): @wraps(fn) def wrapper(*args, **kwargs): @@ -30,9 +32,12 @@ def default_tensor_type(type): return wrapper return decorator + class InputVariableMixin(object): + def _get_input(self): input = TestBase._get_input(self) + def map_variables(i): if isinstance(i, Variable): return i @@ -44,6 +49,7 @@ class InputVariableMixin(object): class NewModuleTest(InputVariableMixin, ModuleTest): + def __init__(self, *args, **kwargs): super(NewModuleTest, self).__init__(*args, **kwargs) self.cudnn = kwargs.get('cudnn', False) @@ -356,21 +362,21 @@ class TestNN(NNTestCase): def _test_dropout(self, cls, input): p = 0.2 - input.fill_(1-p) + input.fill_(1 - p) module = cls(p) input_var = Variable(input, requires_grad=True) output = module(input_var) - self.assertLess(abs(output.data.mean() - (1-p)), 0.05) + self.assertLess(abs(output.data.mean() - (1 - p)), 0.05) output.backward(input) - self.assertLess(abs(input_var.grad.data.mean() - (1-p)), 0.05) + self.assertLess(abs(input_var.grad.data.mean() - (1 - p)), 0.05) module = cls(p, True) input_var = Variable(input.clone(), requires_grad=True) output = module(input_var + 0) - self.assertLess(abs(output.data.mean() - (1-p)), 0.05) + self.assertLess(abs(output.data.mean() - (1 - p)), 0.05) output.backward(input) - self.assertLess(abs(input_var.grad.data.mean() - (1-p)), 0.05) + self.assertLess(abs(input_var.grad.data.mean() - (1 - p)), 0.05) # Check that these don't raise errors module.__repr__() @@ -379,7 +385,9 @@ class TestNN(NNTestCase): def test_parameters(self): def num_params(module): return len(list(module.parameters())) + class Net(nn.Module): + def __init__(self): super(Net, self).__init__() self.l1 = l @@ -394,6 +402,7 @@ class TestNN(NNTestCase): def test_modules(self): class Net(nn.Module): + def __init__(self): super(Net, self).__init__() self.l1 = l @@ -455,6 +464,7 @@ class TestNN(NNTestCase): def test_non_leaf_parameters(self): l1 = nn.Linear(10, 10) l2 = nn.Linear(10, 10) + def assign_weight(): l2.weight = l1.weight + 2 self.assertRaises(TypeError, assign_weight) @@ -462,8 
+472,8 @@ class TestNN(NNTestCase): l2.weight = Parameter(torch.randn(10, 10)) def test_embedding_padding_idx(self): - embedding = nn.Embedding(10, 20, padding_idx = 0) - input = Variable(torch.LongTensor([[0,2,4,5],[4,3,0,9]])) + embedding = nn.Embedding(10, 20, padding_idx=0) + input = Variable(torch.LongTensor([[0, 2, 4, 5], [4, 3, 0, 9]])) output = embedding(input) self.assertEqual(output[0][0].sum().data[0], 0) self.assertEqual(output[1][2].sum().data[0], 0) @@ -493,14 +503,14 @@ class TestNN(NNTestCase): def expected_indices(dim): if dim == 1: return torch.DoubleTensor([1, 3]) - lower_dim = expected_indices(dim-1) + lower_dim = expected_indices(dim - 1) lower_dim = lower_dim.view(1, *lower_dim.size()) - return torch.cat((lower_dim+4, lower_dim+12), 0) + return torch.cat((lower_dim + 4, lower_dim + 12), 0) def expected_grad(dim): if dim == 1: return torch.DoubleTensor([0, 1, 0, 1]) - lower_dim_grad = expected_grad(dim-1) + lower_dim_grad = expected_grad(dim - 1) grad = lower_dim_grad.view(1, *lower_dim_grad.size()) zero = torch.zeros(grad.size()) return torch.cat((zero, grad, zero, grad), 0) @@ -671,7 +681,9 @@ class TestNN(NNTestCase): def test_data_parallel_nested_output(self): def fn(input): return [input, (input.sin(), input.cos(), [input.add(1)]), input] + class Net(nn.Module): + def forward(self, input): return fn(input) i = Variable(torch.randn(2, 2).float().cuda(1)) @@ -690,7 +702,9 @@ class TestNN(NNTestCase): def test_data_parallel_nested_input(self): def fn(input): return input[1][0] + class Net(nn.Module): + def forward(self, input): return fn(input) i = Variable(torch.randn(20, 3).float().cuda(1)) @@ -712,7 +726,7 @@ class TestNN(NNTestCase): def test_state_dict(self): l = nn.Linear(5, 5) block = nn.Module() - block.conv=nn.Conv2d(3, 3, 3, bias=False) + block.conv = nn.Conv2d(3, 3, 3, bias=False) net = nn.Module() net.linear1 = l net.linear2 = l @@ -781,6 +795,7 @@ class TestNN(NNTestCase): def test_parameter_assignment(self): l = nn.Linear(5, 5) + def num_params(): return len(list(l.parameters())) self.assertEqual(num_params(), 2) @@ -814,9 +829,9 @@ class TestNN(NNTestCase): # These sizes require huge cuDNN workspaces. 
Make sure we choose a # reasonable algorithm that does not run out of memory sizes = [ - (1, 256, 109, 175), - (1, 256, 80, 128), - (1, 256, 120, 192), + (1, 256, 109, 175), + (1, 256, 80, 128), + (1, 256, 120, 192), ] dtype = torch.cuda.FloatTensor @@ -887,7 +902,7 @@ class TestNN(NNTestCase): small_t = torch.rand(1, 1, 5, 5) for i in range(0, 4, 2): for j in range(0, 4, 2): - small_t[:,:,i,j] = 100 + small_t[:, :, i, j] = 100 output_small, indices_small = m(Variable(small_t)) for h in range(3, 10): for w in range(3, 10): @@ -900,10 +915,11 @@ class TestNN(NNTestCase): mu(output_small, indices_small, output_size=size) else: self.assertRaises(ValueError, lambda: - mu(output_small, indices_small, (h, w))) + mu(output_small, indices_small, (h, w))) def test_container_copy(self): class Model(nn.Module): + def __init__(self): super(Model, self).__init__() self.linear = nn.Linear(4, 5) @@ -955,7 +971,7 @@ class TestNN(NNTestCase): for i in range(6): hx, cx = lstm(input, (hx, cx)) - (hx+cx).sum().backward() + (hx + cx).sum().backward() @unittest.skipIf(not TEST_CUDNN, "needs cudnn") @default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented @@ -987,9 +1003,9 @@ class TestNN(NNTestCase): output, hy = rnn(input, hx) # FIXME this is because of a pytorch bug if is_lstm: - fake_loss = 0*(hy[0] + hy[1]).sum() + fake_loss = 0 * (hy[0] + hy[1]).sum() else: - fake_loss = 0*hy.sum() + fake_loss = 0 * hy.sum() loss = output.sum() + fake_loss loss.backward() @@ -1019,11 +1035,10 @@ class TestNN(NNTestCase): for (cpu_weight, gpu_weight) in zip(cpu_layer_weight, gpu_layer_weight): self.assertEqual(cpu_weight.grad.data, gpu_weight.grad.data, prec=5e-5) - for module in (nn.RNN, nn.LSTM, nn.GRU): for bias in (True, False): for bidirectional in (False, True): - for dropout in (0, 1): # Because of dropout randomness, can only compare 0 and 1 + for dropout in (0, 1): # Because of dropout randomness, can only compare 0 and 1 for batch_first in (False, True): num_directions = 2 if bidirectional else 1 if batch_first: @@ -1038,7 +1053,7 @@ class TestNN(NNTestCase): bias=bias, dropout=dropout, bidirectional=bidirectional, - batch_first = batch_first) + batch_first=batch_first) outputs_cpu = forward_backward( False, rnn, input_val, hx_val, rnn.all_weights) @@ -1049,7 +1064,7 @@ class TestNN(NNTestCase): bias=bias, dropout=dropout, bidirectional=bidirectional, - batch_first = batch_first) + batch_first=batch_first) outputs_gpu = forward_backward( True, rnn_gpu, input_val, hx_val, rnn.all_weights) @@ -1087,8 +1102,8 @@ class TestNN(NNTestCase): rnn.weight_hh_l0.data.fill_(1) rnn.weight_ih_l1.data.fill_(1) rnn.weight_hh_l1.data.fill_(1) - input = Variable(torch.Tensor(1,1,10).fill_(1)) - hx = Variable(torch.Tensor(2,1,1000).fill_(0)) + input = Variable(torch.Tensor(1, 1, 10).fill_(1)) + hx = Variable(torch.Tensor(2, 1, 1000).fill_(0)) if cuda: input = input.cuda() hx = hx.cuda() @@ -1129,8 +1144,8 @@ class TestNN(NNTestCase): rnn.train() else: rnn.eval() - input = Variable(torch.Tensor(1,1,100).uniform_()) - hx = Variable(torch.Tensor(2,1,100).uniform_()) + input = Variable(torch.Tensor(1, 1, 100).uniform_()) + hx = Variable(torch.Tensor(2, 1, 100).uniform_()) if cuda: input = input.cuda() hx = hx.cuda() @@ -1185,8 +1200,8 @@ class TestNN(NNTestCase): module = nn.BatchNorm1d(3).type(tp) module.eval() - data = Variable(torch.rand(4,3).type(tp), requires_grad=True) - grad = torch.rand(4,3).type(tp) + data = Variable(torch.rand(4, 3).type(tp), requires_grad=True) + grad = 
torch.rand(4, 3).type(tp) # 1st pass res1 = module(data) @@ -1210,8 +1225,8 @@ def add_test(test): raise RuntimeError('Found two tests with the same name: ' + test_name) if hasattr(TestNN, cuda_test_name): raise RuntimeError('Found two tests with the same name: ' + cuda_test_name) - setattr(TestNN, test_name, lambda self,test=test: test(self)) - setattr(TestNN, cuda_test_name, lambda self,test=test: test.test_cuda(self)) + setattr(TestNN, test_name, lambda self, test=test: test(self)) + setattr(TestNN, cuda_test_name, lambda self, test=test: test.test_cuda(self)) new_module_tests = [ @@ -1528,13 +1543,15 @@ new_module_tests = [ jacobian_input=False ), dict( - constructor=lambda: nn.FractionalMaxPool2d(2, output_ratio=0.5, _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()), + constructor=lambda: nn.FractionalMaxPool2d( + 2, output_ratio=0.5, _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()), input_size=(1, 3, 5, 5), fullname='FractionalMaxPool2d_ratio', test_cuda=False ), dict( - constructor=lambda: nn.FractionalMaxPool2d((2, 2), output_size=(4, 4), _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()), + constructor=lambda: nn.FractionalMaxPool2d((2, 2), output_size=( + 4, 4), _random_samples=torch.DoubleTensor(1, 3, 2).uniform_()), input_size=(1, 3, 7, 7), fullname='FractionalMaxPool2d_size', test_cuda=False @@ -1596,6 +1613,7 @@ for test_params in criterion_tests: class UnpoolingNet(nn.Module): + def __init__(self, pool, unpool): super(UnpoolingNet, self).__init__() self.pool = pool diff --git a/test/test_optim.py b/test/test_optim.py index 003e47cef7..3c8dfd9105 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -53,7 +53,7 @@ class TestOptim(TestCase): for i in range(2000): optimizer.step(eval) old_fn(lambda _: (rosenbrock(params_t), drosenbrock(params_t)), - params_t, state) + params_t, state) self.assertEqual(params.data, params_t) self.assertLessEqual(params.data.dist(solution), initial_dist) @@ -128,8 +128,8 @@ class TestOptim(TestCase): ) # non-contiguous parameters self._test_basic_cases_template( - torch.randn(10, 5, 2)[...,0], - torch.randn(10, 2)[...,0], + torch.randn(10, 5, 2)[..., 0], + torch.randn(10, 2)[..., 0], torch.randn(5), constructor ) diff --git a/test/test_sparse.py b/test/test_sparse.py index 1b0a9c52bb..11b51eaf3f 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -11,6 +11,7 @@ SparseTensor = sparse.DoubleTensor class TestSparse(TestCase): + @staticmethod def _gen_sparse(d, nnz, with_size): v = torch.randn(nnz) @@ -19,7 +20,7 @@ class TestSparse(TestCase): x = SparseTensor(i, v) else: i = torch.rand(d, nnz) * \ - torch.Tensor(with_size).repeat(nnz, 1).transpose(0, 1) + torch.Tensor(with_size).repeat(nnz, 1).transpose(0, 1) i = i.type(torch.LongTensor) x = SparseTensor(i, v, torch.Size(with_size)) @@ -74,13 +75,13 @@ class TestSparse(TestCase): def test_contig(self): i = torch.LongTensor([ - [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], + [1, 0, 35, 14, 39, 6, 71, 66, 40, 27], [92, 31, 62, 50, 22, 65, 89, 74, 56, 34], ]) v = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) x = SparseTensor(i, v, torch.Size([100, 100])) exp_i = torch.LongTensor([ - [0, 1, 6, 14, 27, 35, 39, 40, 66, 71], + [0, 1, 6, 14, 27, 35, 39, 40, 66, 71], [31, 92, 65, 50, 34, 62, 22, 56, 74, 89], ]) exp_v = torch.Tensor([2, 1, 6, 4, 10, 3, 5, 9, 8, 7]) diff --git a/test/test_torch.py b/test/test_torch.py index 4a0591222e..0d92819b3c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -15,6 +15,7 @@ if TEST_NUMPY: SIZE = 100 + def skipIfNoLapack(fn): @wraps(fn) 
def wrapper(*args, **kwargs): @@ -26,6 +27,7 @@ def skipIfNoLapack(fn): raise return wrapper + class TestTorch(TestCase): def test_dot(self): @@ -36,7 +38,7 @@ class TestTorch(TestCase): for tname, prec in types.items(): v1 = torch.randn(100).type(tname) v2 = torch.randn(100).type(tname) - res1 = torch.dot(v1,v2) + res1 = torch.dot(v1, v2) res2 = 0 for i, j in zip(v1, v2): res2 += i * j @@ -54,9 +56,9 @@ class TestTorch(TestCase): # non-contiguous m1 = torch.randn(*size) - res1 = torchfn(m1[:,4]) + res1 = torchfn(m1[:, 4]) res2 = res1.clone().zero_() - for i, v in enumerate(m1[:,4]): + for i, v in enumerate(m1[:, 4]): res2[i] = mathfn(v) self.assertEqual(res1, res2) @@ -112,7 +114,7 @@ class TestTorch(TestCase): def test_sigmoid(self): # TODO: why not simulate math.sigmoid like with rsqrt? - inputValues = [-1000,-1,0,0.5,1,2,1000] + inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000] expectedOutput = [0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000] precision_4dps = 0.0002 @@ -145,31 +147,31 @@ class TestTorch(TestCase): def _testSelection(self, torchfn, mathfn): # contiguous - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) res1 = torchfn(m1) - res2 = m1[0,0] + res2 = m1[0, 0] for i, j in iter_indices(m1): - res2 = mathfn(res2, m1[i,j]) + res2 = mathfn(res2, m1[i, j]) self.assertEqual(res1, res2) # non-contiguous - m1 = torch.randn(10,10,10) - m2 = m1[:,4] + m1 = torch.randn(10, 10, 10) + m2 = m1[:, 4] res1 = torchfn(m2) - res2 = m2[0,0] + res2 = m2[0, 0] for i, j in iter_indices(m2): res2 = mathfn(res2, m2[i][j]) self.assertEqual(res1, res2) # with indices - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) res1val, res1ind = torchfn(m1, 1) - res2val = m1[:,0:1].clone() + res2val = m1[:, 0:1].clone() res2ind = res1ind.clone().fill_(0) for i, j in iter_indices(m1): - if mathfn(res2val[i,0], m1[i,j]) != res2val[i,0]: - res2val[i,0] = m1[i,j] - res2ind[i,0] = j + if mathfn(res2val[i, 0], m1[i, j]) != res2val[i, 0]: + res2val[i, 0] = m1[i, j] + res2ind[i, 0] = j maxerr = 0 for i in range(res1val.size(0)): @@ -211,7 +213,7 @@ class TestTorch(TestCase): def test_lerp(self): def TH_lerp(a, b, weight): - return a + weight * (b-a); + return a + weight * (b - a) size = (100, 100) a = torch.rand(*size) @@ -244,10 +246,10 @@ class TestTorch(TestCase): test((5, 5)) def test_mv(self): - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) v1 = torch.randn(100) - res1 = torch.mv(m1,v1) + res1 = torch.mv(m1, v1) res2 = res1.clone().zero_() for i, j in iter_indices(m1): res2[i] += m1[i][j] * v1[j] @@ -256,51 +258,51 @@ class TestTorch(TestCase): def test_add(self): # [res] torch.add([res,] tensor1, tensor2) - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) v1 = torch.randn(100) # contiguous res1 = torch.add(m1[4], v1) res2 = res1.clone().zero_() for i in range(m1.size(1)): - res2[i] = m1[4,i] + v1[i] + res2[i] = m1[4, i] + v1[i] self.assertEqual(res1, res2) - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) v1 = torch.randn(100) # non-contiguous - res1 = torch.add(m1[:,4],v1) + res1 = torch.add(m1[:, 4], v1) res2 = res1.clone().zero_() for i in range(m1.size(0)): - res2[i] = m1[i,4] + v1[i] + res2[i] = m1[i, 4] + v1[i] self.assertEqual(res1, res2) # [res] torch.add([res,] tensor, value) - m1 = torch.randn(10,10) + m1 = torch.randn(10, 10) # contiguous res1 = m1.clone() res1[3].add_(2) res2 = m1.clone() for i in range(m1.size(1)): - res2[3,i] = res2[3,i] + 2 + res2[3, i] = res2[3, i] + 2 self.assertEqual(res1, res2) # non-contiguous - m1 = torch.randn(10,10) + m1 = torch.randn(10, 
10) res1 = m1.clone() - res1[:,3].add_(2) + res1[:, 3].add_(2) res2 = m1.clone() for i in range(m1.size(0)): - res2[i,3] = res2[i,3] + 2 + res2[i, 3] = res2[i, 3] + 2 self.assertEqual(res1, res2) # [res] torch.add([res,] tensor1, value, tensor2) def test_csub(self): # with a tensor - a = torch.randn(100,90) + a = torch.randn(100, 90) b = a.clone().normal_() res_add = torch.add(a, -1, b) @@ -309,7 +311,7 @@ class TestTorch(TestCase): self.assertEqual(res_add, res_csub) # with a scalar - a = torch.randn(100,100) + a = torch.randn(100, 100) scalar = 123.5 res_add = torch.add(a, -scalar) @@ -318,7 +320,7 @@ class TestTorch(TestCase): self.assertEqual(res_add, res_csub) def test_neg(self): - a = torch.randn(100,90) + a = torch.randn(100, 90) zeros = torch.Tensor().resize_as_(a).zero_() res_add = torch.add(zeros, -1, a) @@ -327,7 +329,7 @@ class TestTorch(TestCase): self.assertEqual(res_neg, res_add) def test_reciprocal(self): - a = torch.randn(100,89) + a = torch.randn(100, 89) zeros = torch.Tensor().resize_as_(a).zero_() res_pow = torch.pow(a, -1) @@ -336,97 +338,97 @@ class TestTorch(TestCase): self.assertEqual(res_reciprocal, res_pow) def test_mul(self): - m1 = torch.randn(10,10) + m1 = torch.randn(10, 10) res1 = m1.clone() - res1[:,3].mul_(2) + res1[:, 3].mul_(2) res2 = m1.clone() for i in range(res1.size(0)): - res2[i,3] = res2[i,3] * 2 + res2[i, 3] = res2[i, 3] * 2 self.assertEqual(res1, res2) def test_div(self): - m1 = torch.randn(10,10) + m1 = torch.randn(10, 10) res1 = m1.clone() - res1[:,3].div_(2) + res1[:, 3].div_(2) res2 = m1.clone() for i in range(m1.size(0)): - res2[i,3] = res2[i,3] / 2 + res2[i, 3] = res2[i, 3] / 2 self.assertEqual(res1, res2) def test_fmod(self): - m1 = torch.Tensor(10,10).uniform_(-10., 10.) + m1 = torch.Tensor(10, 10).uniform_(-10., 10.) res1 = m1.clone() q = 2.1 - res1[:,3].fmod_(q) + res1[:, 3].fmod_(q) res2 = m1.clone() for i in range(m1.size(1)): - res2[i,3] = math.fmod(res2[i,3], q) + res2[i, 3] = math.fmod(res2[i, 3], q) self.assertEqual(res1, res2) def test_remainder(self): m1 = torch.Tensor(10, 10).uniform_(-10., 10.) 
res1 = m1.clone() q = 2.1 - res1[:,3].remainder_(q) + res1[:, 3].remainder_(q) res2 = m1.clone() for i in range(m1.size(0)): - res2[i,3] = res2[i,3] % q + res2[i, 3] = res2[i, 3] % q self.assertEqual(res1, res2) def test_mm(self): # helper function - def matrixmultiply(mat1,mat2): + def matrixmultiply(mat1, mat2): n = mat1.size(0) m = mat1.size(1) p = mat2.size(1) - res = torch.zeros(n,p) + res = torch.zeros(n, p) for i, j in iter_indices(res): - res[i,j] = sum(mat1[i,k] * mat2[k,j] for k in range(m)) + res[i, j] = sum(mat1[i, k] * mat2[k, j] for k in range(m)) return res # contiguous case n, m, p = 10, 10, 5 - mat1 = torch.randn(n,m) - mat2 = torch.randn(m,p) - res = torch.mm(mat1,mat2) + mat1 = torch.randn(n, m) + mat2 = torch.randn(m, p) + res = torch.mm(mat1, mat2) - res2 = matrixmultiply(mat1,mat2) + res2 = matrixmultiply(mat1, mat2) self.assertEqual(res, res2) # non contiguous case 1 n, m, p = 10, 10, 5 - mat1 = torch.randn(n,m) - mat2 = torch.randn(p,m).t() - res = torch.mm(mat1,mat2) + mat1 = torch.randn(n, m) + mat2 = torch.randn(p, m).t() + res = torch.mm(mat1, mat2) - res2 = matrixmultiply(mat1,mat2) + res2 = matrixmultiply(mat1, mat2) self.assertEqual(res, res2) # non contiguous case 2 n, m, p = 10, 10, 5 - mat1 = torch.randn(m,n).t() - mat2 = torch.randn(m,p) - res = torch.mm(mat1,mat2) + mat1 = torch.randn(m, n).t() + mat2 = torch.randn(m, p) + res = torch.mm(mat1, mat2) - res2 = matrixmultiply(mat1,mat2) + res2 = matrixmultiply(mat1, mat2) self.assertEqual(res, res2) # non contiguous case 3 n, m, p = 10, 10, 5 - mat1 = torch.randn(m,n).t() - mat2 = torch.randn(p,m).t() - res = torch.mm(mat1,mat2) + mat1 = torch.randn(m, n).t() + mat2 = torch.randn(p, m).t() + res = torch.mm(mat1, mat2) - res2 = matrixmultiply(mat1,mat2) + res2 = matrixmultiply(mat1, mat2) self.assertEqual(res, res2) # test with zero stride n, m, p = 10, 10, 5 - mat1 = torch.randn(n,m) - mat2 = torch.randn(m,1).expand(m,p) - res = torch.mm(mat1,mat2) + mat1 = torch.randn(n, m) + mat2 = torch.randn(m, 1).expand(m, p) + res = torch.mm(mat1, mat2) - res2 = matrixmultiply(mat1,mat2) + res2 = matrixmultiply(mat1, mat2) self.assertEqual(res, res2) def test_bmm(self): @@ -449,25 +451,25 @@ class TestTorch(TestCase): res = torch.bmm(b1, b2) res2 = torch.Tensor().resize_as_(res[0]).zero_() - res2.addbmm_(b1,b2) + res2.addbmm_(b1, b2) self.assertEqual(res2, res.sum(0)[0]) - res2.addbmm_(1,b1,b2) - self.assertEqual(res2, res.sum(0)[0]*2) + res2.addbmm_(1, b1, b2) + self.assertEqual(res2, res.sum(0)[0] * 2) - res2.addbmm_(1.,.5,b1,b2) - self.assertEqual(res2, res.sum(0)[0]*2.5) + res2.addbmm_(1., .5, b1, b2) + self.assertEqual(res2, res.sum(0)[0] * 2.5) - res3 = torch.addbmm(1,res2,0,b1,b2) + res3 = torch.addbmm(1, res2, 0, b1, b2) self.assertEqual(res3, res2) - res4 = torch.addbmm(1,res2,.5,b1,b2) - self.assertEqual(res4, res.sum(0)[0]*3) + res4 = torch.addbmm(1, res2, .5, b1, b2) + self.assertEqual(res4, res.sum(0)[0] * 3) - res5 = torch.addbmm(0,res2,1,b1,b2) + res5 = torch.addbmm(0, res2, 1, b1, b2) self.assertEqual(res5, res.sum(0)[0]) - res6 = torch.addbmm(.1,res2,.5,b1,b2) + res6 = torch.addbmm(.1, res2, .5, b1, b2) self.assertEqual(res6, res2 * .1 + res.sum(0) * .5) def test_baddbmm(self): @@ -478,25 +480,25 @@ class TestTorch(TestCase): res = torch.bmm(b1, b2) res2 = torch.Tensor().resize_as_(res).zero_() - res2.baddbmm_(b1,b2) + res2.baddbmm_(b1, b2) self.assertEqual(res2, res) - res2.baddbmm_(1,b1,b2) - self.assertEqual(res2, res*2) + res2.baddbmm_(1, b1, b2) + self.assertEqual(res2, res * 2) - 
res2.baddbmm_(1,.5,b1,b2) - self.assertEqual(res2, res*2.5) + res2.baddbmm_(1, .5, b1, b2) + self.assertEqual(res2, res * 2.5) - res3 = torch.baddbmm(1,res2,0,b1,b2) + res3 = torch.baddbmm(1, res2, 0, b1, b2) self.assertEqual(res3, res2) - res4 = torch.baddbmm(1,res2,.5,b1,b2) - self.assertEqual(res4, res*3) + res4 = torch.baddbmm(1, res2, .5, b1, b2) + self.assertEqual(res4, res * 3) - res5 = torch.baddbmm(0,res2,1,b1,b2) + res5 = torch.baddbmm(0, res2, 1, b1, b2) self.assertEqual(res5, res) - res6 = torch.baddbmm(.1,res2,.5,b1,b2) + res6 = torch.baddbmm(.1, res2, .5, b1, b2) self.assertEqual(res6, res2 * .1 + res * .5) def test_clamp(self): @@ -531,7 +533,7 @@ class TestTorch(TestCase): # base - tensor, exponent - number # contiguous - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) res1 = torch.pow(m1[4], 3) res2 = res1.clone().zero_() for i in range(res2.size(0)): @@ -539,25 +541,25 @@ class TestTorch(TestCase): self.assertEqual(res1, res2) # non-contiguous - m1 = torch.randn(100,100) - res1 = torch.pow(m1[:,4], 3) + m1 = torch.randn(100, 100) + res1 = torch.pow(m1[:, 4], 3) res2 = res1.clone().zero_() for i in range(res2.size(0)): - res2[i] = math.pow(m1[i,4], 3) + res2[i] = math.pow(m1[i, 4], 3) self.assertEqual(res1, res2) # base - number, exponent - tensor # contiguous - m1 = torch.randn(100,100) + m1 = torch.randn(100, 100) res1 = torch.pow(3, m1[4]) res2 = res1.clone().zero_() for i in range(res2.size(0)): - res2[i] = math.pow(3, m1[4,i]) + res2[i] = math.pow(3, m1[4, i]) self.assertEqual(res1, res2) # non-contiguous - m1 = torch.randn(100,100) - res1 = torch.pow(3, m1[:,4]) + m1 = torch.randn(100, 100) + res1 = torch.pow(3, m1[:, 4]) res2 = res1.clone().zero_() for i in range(res2.size(0)): res2[i] = math.pow(3, m1[i][4]) @@ -567,7 +569,7 @@ class TestTorch(TestCase): def reference_implementation(res2): for i, j in iter_indices(sm1): idx1d = i * sm1.size(0) + j - res2[i,j] = mathfn(sm1[i,j], sm2[idx1d]) + res2[i, j] = mathfn(sm1[i, j], sm2[idx1d]) return res2 # contiguous @@ -582,8 +584,8 @@ class TestTorch(TestCase): # non-contiguous m1 = torch.randn(10, 10, 10) m2 = torch.randn(10 * 10, 10 * 10) - sm1 = m1[:,4] - sm2 = m2[:,4] + sm1 = m1[:, 4] + sm2 = m2[:, 4] res1 = torchfn(sm1, sm2) res2 = reference_implementation(res1.clone()) self.assertEqual(res1, res2) @@ -649,7 +651,7 @@ class TestTorch(TestCase): def test_histc(self): x = torch.Tensor((2, 4, 2, 2, 5, 4)) - y = torch.histc(x, 5, 1, 5) # nbins, min, max + y = torch.histc(x, 5, 1, 5) # nbins, min, max z = torch.Tensor((0, 3, 0, 2, 1)) self.assertEqual(y, z) @@ -673,7 +675,7 @@ class TestTorch(TestCase): self.assertEqual(res1, res2) def test_renorm(self): - m1 = torch.randn(10,5) + m1 = torch.randn(10, 5) res1 = torch.Tensor() def renorm(matrix, value, dim, max_norm): @@ -708,9 +710,9 @@ class TestTorch(TestCase): def test_multinomial(self): # with replacement n_row = 3 - for n_col in range(4, 5+1): + for n_col in range(4, 5 + 1): prob_dist = torch.rand(n_row, n_col) - prob_dist.select(1, n_col-1).fill_(0) #index n_col shouldn't be sampled + prob_dist.select(1, n_col - 1).fill_(0) # index n_col shouldn't be sampled n_sample = n_col sample_indices = torch.multinomial(prob_dist, n_sample, True) self.assertEqual(prob_dist.dim(), 2) @@ -720,9 +722,9 @@ class TestTorch(TestCase): # without replacement n_row = 3 - for n_col in range(4, 5+1): + for n_col in range(4, 5 + 1): prob_dist = torch.rand(n_row, n_col) - prob_dist.select(1, n_col-1).fill_(0) #index n_col shouldn't be sampled + prob_dist.select(1, n_col - 
1).fill_(0) # index n_col shouldn't be sampled n_sample = 3 sample_indices = torch.multinomial(prob_dist, n_sample, False) self.assertEqual(prob_dist.dim(), 2) @@ -730,9 +732,9 @@ class TestTorch(TestCase): for i in range(n_row): row_samples = {} for j in range(n_sample): - sample_idx = sample_indices[i,j] - self.assertNotEqual(sample_idx, n_col-1, - "sampled an index with zero probability") + sample_idx = sample_indices[i, j] + self.assertNotEqual(sample_idx, n_col - 1, + "sampled an index with zero probability") self.assertNotIn(sample_idx, row_samples, "sampled an index twice") row_samples[sample_idx] = True @@ -803,17 +805,17 @@ class TestTorch(TestCase): are_ordered = True for j, k in product(range(SIZE), range(1, SIZE)): - self.assertTrue(check_order(mxx[j][k-1], mxx[j][k]), - 'torch.sort ({}) values unordered for {}'.format(order, task)) + self.assertTrue(check_order(mxx[j][k - 1], mxx[j][k]), + 'torch.sort ({}) values unordered for {}'.format(order, task)) seen = set() indicesCorrect = True - size = x.size(x.dim()-1) + size = x.size(x.dim() - 1) for k in range(size): seen.clear() for j in range(size): self.assertEqual(x[k][ixx[k][j]], mxx[k][j], - 'torch.sort ({}) indices wrong for {}'.format(order, task)) + 'torch.sort ({}) indices wrong for {}'.format(order, task)) seen.add(ixx[k][j]) self.assertEqual(len(seen), size) @@ -840,18 +842,18 @@ class TestTorch(TestCase): ) # Test that we still have proper sorting with duplicate keys - x = torch.floor(torch.rand(SIZE, SIZE)*10) + x = torch.floor(torch.rand(SIZE, SIZE) * 10) torch.sort(x, out=(res2val, res2ind)) self.assertIsOrdered('ascending', x, res2val, res2ind, 'random with duplicate keys') # DESCENDING SORT x = torch.rand(SIZE, SIZE) - res1val, res1ind = torch.sort(x, x.dim()-1, True) + res1val, res1ind = torch.sort(x, x.dim() - 1, True) # Test use of result tensor res2val = torch.Tensor() res2ind = torch.LongTensor() - torch.sort(x, x.dim()-1, True, out=(res2val, res2ind)) + torch.sort(x, x.dim() - 1, True, out=(res2val, res2ind)) self.assertEqual(res1val, res2val, 0) self.assertEqual(res1ind, res2ind, 0) @@ -892,8 +894,8 @@ class TestTorch(TestCase): compareTensors(t, sortKVal, sortKInd, topKVal, topKInd, dim) t = torch.rand(random.randint(1, SIZE), - random.randint(1, SIZE), - random.randint(1, SIZE)) + random.randint(1, SIZE), + random.randint(1, SIZE)) for kTries in range(3): for dimTries in range(3): @@ -926,23 +928,23 @@ class TestTorch(TestCase): res1val, res1ind = torch.kthvalue(x, k) res2val, res2ind = torch.sort(x) - self.assertEqual(res1val[:,:,0], res2val[:,:,k-1], 0) - self.assertEqual(res1ind[:,:,0], res2ind[:,:,k-1], 0) + self.assertEqual(res1val[:, :, 0], res2val[:, :, k - 1], 0) + self.assertEqual(res1ind[:, :, 0], res2ind[:, :, k - 1], 0) # test use of result tensors k = random.randint(1, SIZE) res1val = torch.Tensor() res1ind = torch.LongTensor() torch.kthvalue(x, k, out=(res1val, res1ind)) res2val, res2ind = torch.sort(x) - self.assertEqual(res1val[:,:,0], res2val[:,:,k-1], 0) - self.assertEqual(res1ind[:,:,0], res2ind[:,:,k-1], 0) + self.assertEqual(res1val[:, :, 0], res2val[:, :, k - 1], 0) + self.assertEqual(res1ind[:, :, 0], res2ind[:, :, k - 1], 0) # test non-default dim k = random.randint(1, SIZE) res1val, res1ind = torch.kthvalue(x, k, 0) res2val, res2ind = torch.sort(x, 0) - self.assertEqual(res1val[0], res2val[k-1], 0) - self.assertEqual(res1ind[0], res2ind[k-1], 0) + self.assertEqual(res1val[0], res2val[k - 1], 0) + self.assertEqual(res1ind[0], res2ind[k - 1], 0) # non-contiguous y = x.narrow(1, 0, 
1) @@ -968,7 +970,7 @@ class TestTorch(TestCase): res1val, res1ind = torch.median(x) res2val, res2ind = torch.sort(x) - ind = int(math.floor((size+1)/2) - 1) + ind = int(math.floor((size + 1) / 2) - 1) self.assertEqual(res2val.select(1, ind), res1val.select(1, 0), 0) self.assertEqual(res2val.select(1, ind), res1val.select(1, 0), 0) @@ -992,15 +994,15 @@ class TestTorch(TestCase): def test_mode(self): x = torch.range(1, SIZE * SIZE).clone().resize_(SIZE, SIZE) x[:2] = 1 - x[:,:2] = 1 + x[:, :2] = 1 x0 = x.clone() # Pre-calculated results. res1val = torch.Tensor(SIZE, 1).fill_(1) # The indices are the position of the last appearance of the mode element. res1ind = torch.LongTensor(SIZE, 1).fill_(1) - res1ind[0] = SIZE-1 - res1ind[1] = SIZE-1 + res1ind[0] = SIZE - 1 + res1ind[1] = SIZE - 1 res2val, res2ind = torch.mode(x) @@ -1124,16 +1126,16 @@ class TestTorch(TestCase): @skipIfNoLapack def test_gesv(self): - a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), - (-6.05, -3.30, 5.36, -4.44, 1.08), - (-0.45, 2.58, -2.70, 0.27, 9.04), - (8.32, 2.71, 4.35, -7.17, 2.14), - (-9.67, -5.14, -7.26, 6.08, -6.87))).t() - b = torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), - (-1.56, 4.00, -8.67, 1.75, 2.86), - (9.81, -4.09, -4.57, -8.61, 8.99))).t() - - res1 = torch.gesv(b,a)[0] + a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), + (-6.05, -3.30, 5.36, -4.44, 1.08), + (-0.45, 2.58, -2.70, 0.27, 9.04), + (8.32, 2.71, 4.35, -7.17, 2.14), + (-9.67, -5.14, -7.26, 6.08, -6.87))).t() + b = torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), + (-1.56, 4.00, -8.67, 1.75, 2.86), + (9.81, -4.09, -4.57, -8.61, 8.99))).t() + + res1 = torch.gesv(b, a)[0] self.assertLessEqual(b.dist(torch.mm(a, res1)), 1e-12) ta = torch.Tensor() tb = torch.Tensor() @@ -1195,55 +1197,55 @@ class TestTorch(TestCase): a = torch.Tensor(((1, 2, 3), (4, 5, 6), (7, 8, 10))) expected_q = torch.Tensor(( - (-1.230914909793328e-01, 9.045340337332914e-01, 4.082482904638621e-01), - (-4.923659639173310e-01, 3.015113445777629e-01, -8.164965809277264e-01), + (-1.230914909793328e-01, 9.045340337332914e-01, 4.082482904638621e-01), + (-4.923659639173310e-01, 3.015113445777629e-01, -8.164965809277264e-01), (-8.616404368553292e-01, -3.015113445777631e-01, 4.082482904638634e-01))) expected_r = torch.Tensor(( (-8.124038404635959e+00, -9.601136296387955e+00, -1.193987e+01), - ( 0.000000000000000e+00, 9.045340337332926e-01, 1.507557e+00), - ( 0.000000000000000e+00, 0.000000000000000e+00, 4.082483e-01))) + (0.000000000000000e+00, 9.045340337332926e-01, 1.507557e+00), + (0.000000000000000e+00, 0.000000000000000e+00, 4.082483e-01))) check_qr(a, expected_q, expected_r) # check rectangular thin a = torch.Tensor(( - ( 1, 2, 3), - ( 4, 5, 6), - ( 7, 8, 9), - (10, 11, 13), - )) + (1, 2, 3), + (4, 5, 6), + (7, 8, 9), + (10, 11, 13), + )) expected_q = torch.Tensor(( - (-0.0776150525706334, -0.833052161400748 , 0.3651483716701106), + (-0.0776150525706334, -0.833052161400748, 0.3651483716701106), (-0.3104602102825332, -0.4512365874254053, -0.1825741858350556), (-0.5433053679944331, -0.0694210134500621, -0.7302967433402217), - (-0.7761505257063329, 0.3123945605252804, 0.5477225575051663) + (-0.7761505257063329, 0.3123945605252804, 0.5477225575051663) )) expected_r = torch.Tensor(( (-12.8840987267251261, -14.5916298832790581, -17.0753115655393231), - ( 0, -1.0413152017509357, -1.770235842976589 ), - ( 0, 0, 0.5477225575051664) + (0, -1.0413152017509357, -1.770235842976589), + (0, 0, 0.5477225575051664) )) check_qr(a, expected_q, expected_r) # check rectangular fat a = 
torch.Tensor(( - (1, 2, 3, 4), - (5, 6, 7, 8), - (9, 10, 11, 13) - )) + (1, 2, 3, 4), + (5, 6, 7, 8), + (9, 10, 11, 13) + )) expected_q = torch.Tensor(( - (-0.0966736489045663, 0.907737593658436 , 0.4082482904638653), - (-0.4833682445228317, 0.3157348151855452, -0.8164965809277254), - (-0.870062840141097 , -0.2762679632873518, 0.4082482904638621) + (-0.0966736489045663, 0.907737593658436, 0.4082482904638653), + (-0.4833682445228317, 0.3157348151855452, -0.8164965809277254), + (-0.870062840141097, -0.2762679632873518, 0.4082482904638621) )) expected_r = torch.Tensor(( - ( -1.0344080432788603e+01, -1.1794185166357092e+01, - -1.3244289899925587e+01, -1.5564457473635180e+01), - ( 0.0000000000000000e+00, 9.4720444555662542e-01, - 1.8944088911132546e+00, 2.5653453733825331e+00), - ( 0.0000000000000000e+00, 0.0000000000000000e+00, - 1.5543122344752192e-15, 4.0824829046386757e-01) + (-1.0344080432788603e+01, -1.1794185166357092e+01, + -1.3244289899925587e+01, -1.5564457473635180e+01), + (0.0000000000000000e+00, 9.4720444555662542e-01, + 1.8944088911132546e+00, 2.5653453733825331e+00), + (0.0000000000000000e+00, 0.0000000000000000e+00, + 1.5543122344752192e-15, 4.0824829046386757e-01) )) check_qr(a, expected_q, expected_r) @@ -1272,14 +1274,14 @@ class TestTorch(TestCase): @skipIfNoLapack def test_trtrs(self): - a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), - (-6.05, -3.30, 5.36, -4.44, 1.08), - (-0.45, 2.58, -2.70, 0.27, 9.04), - (8.32, 2.71, 4.35, -7.17, 2.14), - (-9.67, -5.14, -7.26, 6.08, -6.87))).t() - b = torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), - (-1.56, 4.00, -8.67, 1.75, 2.86), - (9.81, -4.09, -4.57, -8.61, 8.99))).t() + a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), + (-6.05, -3.30, 5.36, -4.44, 1.08), + (-0.45, 2.58, -2.70, 0.27, 9.04), + (8.32, 2.71, 4.35, -7.17, 2.14), + (-9.67, -5.14, -7.26, 6.08, -6.87))).t() + b = torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), + (-1.56, 4.00, -8.67, 1.75, 2.86), + (9.81, -4.09, -4.57, -8.61, 8.99))).t() U = torch.triu(a) L = torch.tril(a) @@ -1317,7 +1319,7 @@ class TestTorch(TestCase): self.assertLessEqual(x.dist(y), 1e-12) # test reuse - res1 = torch.trtrs(b,a)[0] + res1 = torch.trtrs(b, a)[0] ta = torch.Tensor() tb = torch.Tensor() torch.trtrs(b, a, out=(tb, ta)) @@ -1352,42 +1354,42 @@ class TestTorch(TestCase): # basic test expectedNorm = 0 - a = torch.Tensor(((1.44, -9.96, -7.55, 8.34), - (-7.84, -0.28, 3.24, 8.09), - (-4.39, -3.24, 6.27, 5.28), - (4.53, 3.83, -6.64, 2.06))).t() - b = torch.Tensor(((8.58, 8.26, 8.48, -5.28), - (9.35, -4.43, -0.70, -0.26))).t() + a = torch.Tensor(((1.44, -9.96, -7.55, 8.34), + (-7.84, -0.28, 3.24, 8.09), + (-4.39, -3.24, 6.27, 5.28), + (4.53, 3.83, -6.64, 2.06))).t() + b = torch.Tensor(((8.58, 8.26, 8.48, -5.28), + (9.35, -4.43, -0.70, -0.26))).t() _test(a, b, expectedNorm) # test overderemined expectedNorm = 17.390200628863 - a = torch.Tensor(((1.44, -9.96, -7.55, 8.34, 7.08, -5.45), - (-7.84, -0.28, 3.24, 8.09, 2.52, -5.70), - (-4.39, -3.24, 6.27, 5.28, 0.74, -1.19), - (4.53, 3.83, -6.64, 2.06, -2.47, 4.70))).t() - b = torch.Tensor(((8.58, 8.26, 8.48, -5.28, 5.72, 8.93), - (9.35, -4.43, -0.70, -0.26, -7.36, -2.52))).t() + a = torch.Tensor(((1.44, -9.96, -7.55, 8.34, 7.08, -5.45), + (-7.84, -0.28, 3.24, 8.09, 2.52, -5.70), + (-4.39, -3.24, 6.27, 5.28, 0.74, -1.19), + (4.53, 3.83, -6.64, 2.06, -2.47, 4.70))).t() + b = torch.Tensor(((8.58, 8.26, 8.48, -5.28, 5.72, 8.93), + (9.35, -4.43, -0.70, -0.26, -7.36, -2.52))).t() _test(a, b, expectedNorm) # test underdetermined expectedNorm = 0 a = 
torch.Tensor(((1.44, -9.96, -7.55), - (-7.84, -0.28, 3.24), - (-4.39, -3.24, 6.27), - (4.53, 3.83, -6.64))).t() - b = torch.Tensor(((8.58, 8.26, 8.48), - (9.35, -4.43, -0.70))).t() + (-7.84, -0.28, 3.24), + (-4.39, -3.24, 6.27), + (4.53, 3.83, -6.64))).t() + b = torch.Tensor(((8.58, 8.26, 8.48), + (9.35, -4.43, -0.70))).t() _test(a, b, expectedNorm) # test reuse expectedNorm = 0 - a = torch.Tensor(((1.44, -9.96, -7.55, 8.34), - (-7.84, -0.28, 3.24, 8.09), - (-4.39, -3.24, 6.27, 5.28), - (4.53, 3.83, -6.64, 2.06))).t() - b = torch.Tensor(((8.58, 8.26, 8.48, -5.28), - (9.35, -4.43, -0.70, -0.26))).t() + a = torch.Tensor(((1.44, -9.96, -7.55, 8.34), + (-7.84, -0.28, 3.24, 8.09), + (-4.39, -3.24, 6.27, 5.28), + (4.53, 3.83, -6.64, 2.06))).t() + b = torch.Tensor(((8.58, 8.26, 8.48, -5.28), + (9.35, -4.43, -0.70, -0.26))).t() ta = torch.Tensor() tb = torch.Tensor() torch.gels(b, a, out=(tb, ta)) @@ -1399,11 +1401,11 @@ class TestTorch(TestCase): @skipIfNoLapack def test_eig(self): - a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), - (-6.49, 3.80, 0.00, 0.00, 0.00), - (-0.47, -6.39, 4.17, 0.00, 0.00), - (-7.20, 1.50, -1.51, 5.70, 0.00), - (-0.65, -6.34, 2.67, 1.80, -7.10))).t().contiguous() + a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), + (-6.49, 3.80, 0.00, 0.00, 0.00), + (-0.47, -6.39, 4.17, 0.00, 0.00), + (-7.20, 1.50, -1.51, 5.70, 0.00), + (-0.65, -6.34, 2.67, 1.80, -7.10))).t().contiguous() e = torch.eig(a)[0] ee, vv = torch.eig(a, True) te = torch.Tensor() @@ -1416,9 +1418,9 @@ class TestTorch(TestCase): self.assertEqual(vv, tv, 1e-12) # test reuse - X = torch.randn(4,4) + X = torch.randn(4, 4) X = torch.mm(X.t(), X) - e, v = torch.zeros(4,2), torch.zeros(4,4) + e, v = torch.zeros(4, 2), torch.zeros(4, 4) torch.eig(X, True, out=(e, v)) Xhat = torch.mm(torch.mm(v, torch.diag(e.select(1, 0))), v.t()) self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') @@ -1432,8 +1434,8 @@ class TestTorch(TestCase): # test non-contiguous X = torch.randn(4, 4) X = torch.mm(X.t(), X) - e = torch.zeros(4, 2, 2)[:,1] - v = torch.zeros(4, 2, 4)[:,1] + e = torch.zeros(4, 2, 2)[:, 1] + v = torch.zeros(4, 2, 4)[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') torch.eig(X, True, out=(e, v)) @@ -1442,10 +1444,10 @@ class TestTorch(TestCase): @skipIfNoLapack def test_symeig(self): - xval = torch.rand(100,3) + xval = torch.rand(100, 3) cov = torch.mm(xval.t(), xval) rese = torch.zeros(3) - resv = torch.zeros(3,3) + resv = torch.zeros(3, 3) # First call to symeig self.assertTrue(resv.is_contiguous(), 'resv is not contiguous') @@ -1463,7 +1465,7 @@ class TestTorch(TestCase): X = torch.rand(5, 5) X = X.t() * X e = torch.zeros(4, 2).select(1, 1) - v = torch.zeros(4, 2, 4)[:,1] + v = torch.zeros(4, 2, 4)[:, 1] self.assertFalse(v.is_contiguous(), 'V is contiguous') self.assertFalse(e.is_contiguous(), 'E is contiguous') torch.symeig(X, True, out=(e, v)) @@ -1472,11 +1474,11 @@ class TestTorch(TestCase): @skipIfNoLapack def test_svd(self): - a=torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), - (9.93, 6.91, -7.93, 1.64, 4.02, 0.15), - (9.83, 5.04, 4.86, 8.83, 9.80, -8.99), - (5.45, -0.27, 4.85, 0.74, 10.00, -6.02), - (3.16, 7.98, 3.01, 5.80, 4.27, -5.31))).t().clone() + a = torch.Tensor(((8.79, 6.11, -9.15, 9.57, -3.49, 9.84), + (9.93, 6.91, -7.93, 1.64, 4.02, 0.15), + (9.83, 5.04, 4.86, 8.83, 9.80, -8.99), + (5.45, -0.27, 4.85, 0.74, 10.00, -6.02), + (3.16, 7.98, 3.01, 5.80, 4.27, -5.31))).t().clone() u, s, v = torch.svd(a) uu = torch.Tensor() ss = 
torch.Tensor() @@ -1502,9 +1504,9 @@ class TestTorch(TestCase): # test non-contiguous X = torch.randn(5, 5) - U = torch.zeros(5, 2, 5)[:,1] - S = torch.zeros(5, 2)[:,1] - V = torch.zeros(5, 2, 5)[:,1] + U = torch.zeros(5, 2, 5)[:, 1] + S = torch.zeros(5, 2)[:, 1] + V = torch.zeros(5, 2, 5)[:, 1] self.assertFalse(U.is_contiguous(), 'U is contiguous') self.assertFalse(S.is_contiguous(), 'S is contiguous') @@ -1515,7 +1517,7 @@ class TestTorch(TestCase): @skipIfNoLapack def test_inverse(self): - M = torch.randn(5,5) + M = torch.randn(5, 5) MI = torch.inverse(M) E = torch.eye(5) self.assertFalse(MI.is_contiguous(), 'MI is contiguous') @@ -1542,9 +1544,9 @@ class TestTorch(TestCase): ki = k.clone() ks = k.storage() kis = ki.storage() - for i in range(ks.size()-1, 0, -1): - kis[ks.size()-i+1] = ks[i] - #for i=ks.size(), 1, -1 do kis[ks.size()-i+1]=ks[i] end + for i in range(ks.size() - 1, 0, -1): + kis[ks.size() - i + 1] = ks[i] + # for i=ks.size(), 1, -1 do kis[ks.size()-i+1]=ks[i] end imvx = torch.xcorr2(x, ki) imvx2 = torch.xcorr2(x, ki, 'V') imfx = torch.xcorr2(x, ki, 'F') @@ -1575,20 +1577,20 @@ class TestTorch(TestCase): @unittest.skip("Not implemented yet") def test_conv3(self): x = torch.rand(math.floor(torch.uniform(20, 40)), - math.floor(torch.uniform(20, 40)), - math.floor(torch.uniform(20, 40))) + math.floor(torch.uniform(20, 40)), + math.floor(torch.uniform(20, 40))) k = torch.rand(math.floor(torch.uniform(5, 10)), - math.floor(torch.uniform(5, 10)), - math.floor(torch.uniform(5, 10))) + math.floor(torch.uniform(5, 10)), + math.floor(torch.uniform(5, 10))) imvc = torch.conv3(x, k) imvc2 = torch.conv3(x, k, 'V') imfc = torch.conv3(x, k, 'F') - ki = k.clone(); + ki = k.clone() ks = k.storage() kis = ki.storage() - for i in range(ks.size()-1, 0, -1): - kis[ks.size()-i+1] = ks[i] + for i in range(ks.size() - 1, 0, -1): + kis[ks.size() - i + 1] = ks[i] imvx = torch.xcorr3(x, ki) imvx2 = torch.xcorr3(x, ki, 'V') imfx = torch.xcorr3(x, ki, 'F') @@ -1638,7 +1640,7 @@ class TestTorch(TestCase): def reference(x, k, o3, o32): for i in range(o3.size(1)): for j in range(k.size(1)): - o32[i].add(torch.xcorr2(x[i+j-1], k[j])) + o32[i].add(torch.xcorr2(x[i + j - 1], k[j])) self._test_conv_corr_eq(lambda x, k: torch.xcorr3(x, k), reference) @unittest.skip("Not implemented yet") @@ -1654,7 +1656,7 @@ class TestTorch(TestCase): def reference(x, k, o3, o32): for i in range(o3.size(1)): for j in range(k.size(1)): - o32[i].add(torch.conv2(x[i+j-1], k[k.size(1)-j+1])) + o32[i].add(torch.conv2(x[i + j - 1], k[k.size(1) - j + 1])) self._test_conv_corr_eq(lambda x, k: torch.conv3(x, k), reference) @unittest.skip("Not implemented yet") @@ -1662,7 +1664,7 @@ class TestTorch(TestCase): def reference(x, k, o3, o32): for i in range(o3.size(1)): for j in range(k.size(1)): - o32[i+j-1].add(torch.conv2(x[i], k[j], 'F')) + o32[i + j - 1].add(torch.conv2(x[i], k[j], 'F')) self._test_conv_corr_eq(lambda x, k: torch.conv3(x, k, 'F'), reference) def test_logical(self): @@ -1714,9 +1716,9 @@ class TestTorch(TestCase): torch.manual_seed(123) reseeded = torch.randn(odd_number) self.assertEqual(midstream, repeat_midstream, 0, - 'get_rng_state/set_rng_state not generating same sequence of normally distributed numbers') + 'get_rng_state/set_rng_state not generating same sequence of normally distributed numbers') self.assertEqual(seeded, reseeded, 0, - 'repeated calls to manual_seed not generating same sequence of normally distributed numbers') + 'repeated calls to manual_seed not generating same sequence of normally 
distributed numbers') def test_manual_seed(self): rng_state = torch.get_rng_state() @@ -1750,14 +1752,14 @@ class TestTorch(TestCase): @skipIfNoLapack def test_potrs(self): - a=torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), - (-6.05, -3.30, 5.36, -4.44, 1.08), - (-0.45, 2.58, -2.70, 0.27, 9.04), - (8.32, 2.71, 4.35, -7.17, 2.14), - (-9.67, -5.14, -7.26, 6.08, -6.87))).t() - b=torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), - (-1.56, 4.00, -8.67, 1.75, 2.86), - (9.81, -4.09, -4.57, -8.61, 8.99))).t() + a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), + (-6.05, -3.30, 5.36, -4.44, 1.08), + (-0.45, 2.58, -2.70, 0.27, 9.04), + (8.32, 2.71, 4.35, -7.17, 2.14), + (-9.67, -5.14, -7.26, 6.08, -6.87))).t() + b = torch.Tensor(((4.02, 6.19, -8.22, -7.57, -3.03), + (-1.56, 4.00, -8.67, 1.75, 2.86), + (9.81, -4.09, -4.57, -8.61, 8.99))).t() # make sure 'a' is symmetric PSD a = torch.mm(a, a.t()) @@ -1774,11 +1776,11 @@ class TestTorch(TestCase): @skipIfNoLapack def tset_potri(self): - a=torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), - (-6.05, -3.30, 5.36, -4.44, 1.08), - (-0.45, 2.58, -2.70, 0.27, 9.04), - (8.32, 2.71, 4.35, -7.17, 2.14), - (-9.67, -5.14, -7.26, 6.08, -6.87))).t() + a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23), + (-6.05, -3.30, 5.36, -4.44, 1.08), + (-0.45, 2.58, -2.70, 0.27, 9.04), + (8.32, 2.71, 4.35, -7.17, 2.14), + (-9.67, -5.14, -7.26, 6.08, -6.87))).t() # make sure 'a' is symmetric PSD a = a * a.t() @@ -1839,8 +1841,8 @@ class TestTorch(TestCase): def test_numel(self): b = torch.ByteTensor(3, 100, 100) - self.assertEqual(b.nelement(), 3*100*100) - self.assertEqual(b.numel(), 3*100*100) + self.assertEqual(b.nelement(), 3 * 100 * 100) + self.assertEqual(b.numel(), 3 * 100 * 100) def _consecutive(self, size, start=1): sequence = torch.ones(int(torch.Tensor(size).prod(0)[0])).cumsum(0) @@ -1889,6 +1891,7 @@ class TestTorch(TestCase): def test_newindex(self): reference = self._consecutive((3, 3, 3)) # This relies on __index__() being correct - but we have separate tests for that + def checkPartialAssign(index): reference = torch.zeros(3, 3, 3) reference[index] = self._consecutive((3, 3, 3))[index] @@ -1968,7 +1971,7 @@ class TestTorch(TestCase): for j in range(1 if dim == 1 else n): for k in range(1 if dim == 2 else o): ii = [i, j, k] - ii[dim] = slice(0, idx.size(dim)+1) + ii[dim] = slice(0, idx.size(dim) + 1) idx[tuple(ii)] = torch.randperm(dim_size)[0:elems_per_row] def test_gather(self): @@ -1988,8 +1991,8 @@ class TestTorch(TestCase): for j in range(idx_size[1]): for k in range(idx_size[2]): ii = [i, j, k] - ii[dim] = idx[i,j,k] - expected[i,j,k] = src[tuple(ii)] + ii[dim] = idx[i, j, k] + expected[i, j, k] = src[tuple(ii)] self.assertEqual(actual, expected, 0) idx[0][0][0] = 23 @@ -2017,8 +2020,8 @@ class TestTorch(TestCase): for j in range(idx_size[1]): for k in range(idx_size[2]): ii = [i, j, k] - ii[dim] = idx[i,j,k] - expected[tuple(ii)] = src[i,j,k] + ii[dim] = idx[i, j, k] + expected[tuple(ii)] = src[i, j, k] self.assertEqual(actual, expected, 0) idx[0][0][0] = 34 @@ -2041,7 +2044,7 @@ class TestTorch(TestCase): for j in range(idx_size[1]): for k in range(idx_size[2]): ii = [i, j, k] - ii[dim] = idx[i,j,k] + ii[dim] = idx[i, j, k] expected[tuple(ii)] = val self.assertEqual(actual, expected, 0) @@ -2109,7 +2112,7 @@ class TestTorch(TestCase): self.assertEqual(res.abs(), data, 1e-16) # Checking that the right abs function is called for LongTensor - bignumber = 2^31 + 1 + bignumber = 2 ^ 31 + 1 res = torch.LongTensor((-bignumber,)) self.assertGreater(res.abs()[0], 0) 
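Review note (an aside for readers of this patch, not part of the diff itself): two pre-existing quirks are visible in the hunks above and are only re-indented by this cleanup, not changed. `def tset_potri(self):` appears to be a typo for `test_potri`, so unittest's default `test*` method discovery never collects that LAPACK case. And `bignumber = 2 ^ 31 + 1` uses `^`, which in Python is bitwise XOR and binds more loosely than `+`, so the expression evaluates to `2 ^ 32 == 34` rather than the large value the surrounding comment suggests. A minimal plain-Python illustration of the precedence point (the intended value of 2**31 + 1 is an assumption based on that comment):

    # '+' binds tighter than '^', and '^' is XOR, so this is 2 XOR 32 == 34
    assert 2 ^ 31 + 1 == 34
    # exponentiation is '**'; this is the magnitude the comment presumably intends
    assert 2 ** 31 + 1 == 2147483649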
@@ -2125,7 +2128,7 @@ class TestTorch(TestCase): self.assertEqual(tensor.view(3, -1).size(), target) tensor_view = tensor.view(5, 3) tensor_view.fill_(random.uniform(0, 1)) - self.assertEqual((tensor_view-tensor).abs().max(), 0) + self.assertEqual((tensor_view - tensor).abs().max(), 0) self.assertEqual(empty.view_as(empty), empty) self.assertEqual(empty.view(0), empty) @@ -2150,7 +2153,7 @@ class TestTorch(TestCase): self.assertEqual(result.size(), target, 'Error in repeat using result') result = tensor.repeat(torchSize) self.assertEqual(result.size(), target, 'Error in repeat using result and LongStorage') - self.assertEqual((result.mean(0).view(8, 4)-tensor).abs().max(), 0, 'Error in repeat (not equal)') + self.assertEqual((result.mean(0).view(8, 4) - tensor).abs().max(), 0, 'Error in repeat (not equal)') def test_is_same_size(self): t1 = torch.Tensor(3, 4, 9, 10) @@ -2172,8 +2175,8 @@ class TestTorch(TestCase): self.assertTrue(t3.is_set_to(t1), "is_set_to should be symmetric") self.assertFalse(t1.is_set_to(t4)) self.assertFalse(torch.Tensor().is_set_to(torch.Tensor()), - "Tensors with no storages should not appear to be set " - "to each other") + "Tensors with no storages should not appear to be set " + "to each other") def test_tensor_set(self): t1 = torch.Tensor() @@ -2208,7 +2211,7 @@ class TestTorch(TestCase): # Non contiguous, 2D s = torch.Tensor(((1, 2, 3, 4), (5, 6, 7, 8))) - s1 = s[:,1:3] + s1 = s[:, 1:3] s2 = s1.clone() s3 = torch.Tensor(((2, 3), (6, 7))) s4 = torch.Tensor(((0, 0), (0, 0))) @@ -2222,20 +2225,20 @@ class TestTorch(TestCase): self.assertFalse(torch.equal(s1, s4)) def test_element_size(self): - byte = torch.ByteStorage().element_size() - char = torch.CharStorage().element_size() - short = torch.ShortStorage().element_size() - int = torch.IntStorage().element_size() - long = torch.LongStorage().element_size() - float = torch.FloatStorage().element_size() + byte = torch.ByteStorage().element_size() + char = torch.CharStorage().element_size() + short = torch.ShortStorage().element_size() + int = torch.IntStorage().element_size() + long = torch.LongStorage().element_size() + float = torch.FloatStorage().element_size() double = torch.DoubleStorage().element_size() - self.assertEqual(byte, torch.ByteTensor().element_size()) - self.assertEqual(char, torch.CharTensor().element_size()) - self.assertEqual(short, torch.ShortTensor().element_size()) - self.assertEqual(int, torch.IntTensor().element_size()) - self.assertEqual(long, torch.LongTensor().element_size()) - self.assertEqual(float, torch.FloatTensor().element_size()) + self.assertEqual(byte, torch.ByteTensor().element_size()) + self.assertEqual(char, torch.CharTensor().element_size()) + self.assertEqual(short, torch.ShortTensor().element_size()) + self.assertEqual(int, torch.IntTensor().element_size()) + self.assertEqual(long, torch.LongTensor().element_size()) + self.assertEqual(float, torch.FloatTensor().element_size()) self.assertEqual(double, torch.DoubleTensor().element_size()) self.assertGreater(byte, 0) @@ -2366,12 +2369,12 @@ class TestTorch(TestCase): # This test will allow through some False positives. It only checks # that the elements flagged positive are indeed non-zero. for i in range(dst1.size(0)): - self.assertNotEqual(tensor[dst1[i,0], dst1[i,1]], 0) + self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1]], 0) elif len(shape) == 3: - # This test will allow through some False positives. It only checks - # that the elements flagged positive are indeed non-zero. 
+ # This test will allow through some False positives. It only checks + # that the elements flagged positive are indeed non-zero. for i in range(dst1.size(0)): - self.assertNotEqual(tensor[dst1[i,0], dst1[i,1], dst1[i,2]], 0) + self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1], dst1[i, 2]], 0) def test_deepcopy(self): from copy import deepcopy @@ -2444,8 +2447,8 @@ class TestTorch(TestCase): std = torch.Tensor(100, 100) mean[:50] = 0 mean[50:] = 1 - std[:,:50] = 4 - std[:,50:] = 1 + std[:, :50] = 4 + std[:, 50:] = 1 r = torch.normal(mean) self.assertEqual(r[:50].mean(), 0, 0.2) @@ -2459,14 +2462,14 @@ class TestTorch(TestCase): r = torch.normal(2, std) self.assertEqual(r.mean(), 2, 0.2) - self.assertEqual(r[:,:50].std(), 4, 0.3) - self.assertEqual(r[:,50:].std(), 1, 0.2) + self.assertEqual(r[:, :50].std(), 4, 0.3) + self.assertEqual(r[:, 50:].std(), 1, 0.2) r = torch.normal(mean, std) self.assertEqual(r[:50].mean(), 0, 0.2) self.assertEqual(r[50:].mean(), 1, 0.2) - self.assertEqual(r[:,:50].std(), 4, 0.3) - self.assertEqual(r[:,50:].std(), 1, 0.2) + self.assertEqual(r[:, :50].std(), 4, 0.3) + self.assertEqual(r[:, 50:].std(), 1, 0.2) def test_serialization(self): a = [torch.randn(5, 5).float() for i in range(2)] @@ -2552,7 +2555,7 @@ class TestTorch(TestCase): obj.__repr__() str(obj) for t in torch._storage_classes: - if t.is_cuda and not torch.cuda.is_available(): + if t.is_cuda and not torch.cuda.is_available(): continue obj = t(100).fill_(1) obj.__repr__() @@ -2633,7 +2636,7 @@ class TestTorch(TestCase): # 1D > 0 storage offset xm = torch.randn(sz * 2).mul(255).type(tp) - x = xm.narrow(0, sz-1, sz) + x = xm.narrow(0, sz - 1, sz) self.assertTrue(x.storage_offset() > 0) y = x.numpy() for i in range(sz): @@ -2658,7 +2661,7 @@ class TestTorch(TestCase): # with storage offset xm = torch.randn(sz1 * 2, sz2).mul(255).type(tp) - x = xm.narrow(0, sz1-1, sz1) + x = xm.narrow(0, sz1 - 1, sz1) y = x.numpy() self.assertTrue(x.storage_offset() > 0) check2d(x, y) @@ -2670,14 +2673,14 @@ class TestTorch(TestCase): # with storage offset xm = torch.randn(sz2 * 2, sz1).mul(255).type(tp) - x = xm.narrow(0, sz2-1, sz2).t() + x = xm.narrow(0, sz2 - 1, sz2).t() y = x.numpy() self.assertTrue(x.storage_offset() > 0) check2d(x, y) # non-contiguous 2D with holes xm = torch.randn(sz2 * 2, sz1 * 2).mul(255).type(tp) - x = xm.narrow(0, sz2-1, sz2).narrow(1, sz1-1, sz1).t() + x = xm.narrow(0, sz2 - 1, sz2).narrow(1, sz1 - 1, sz1).t() y = x.numpy() self.assertTrue(x.storage_offset() > 0) check2d(x, y) diff --git a/test/test_utils.py b/test/test_utils.py index 2b842a9837..bf6946e67a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,7 +28,9 @@ try: except ImportError: HAS_CFFI = False + class SimplePlugin(Plugin): + def __init__(self, interval): super(SimplePlugin, self).__init__(interval) self.trainer = None @@ -58,6 +60,7 @@ class SimplePlugin(Plugin): class ModelMock(object): + def __init__(self): self.num_calls = 0 self.output = Variable(torch.ones(1, 1), requires_grad=True) @@ -68,6 +71,7 @@ class ModelMock(object): class CriterionMock(object): + def __init__(self): self.num_calls = 0 @@ -95,6 +99,7 @@ class OptimizerMock(object): class DatasetMock(object): + def __iter__(self): for i in range(10): yield torch.randn(2, 10), torch.randperm(10)[:2] @@ -183,6 +188,7 @@ class TestTrainer(TestCase): test_dir = os.path.abspath(os.path.dirname(str(__file__))) + class TestFFI(TestCase): def setUp(self): @@ -196,13 +202,13 @@ class TestFFI(TestCase): @unittest.skipIf(not HAS_CFFI, "ffi tests require cffi 
package") def test_cpu(self): compile_extension( - name='test_extensions.cpulib', - header=test_dir + '/ffi/src/cpu/lib.h', - sources=[ - test_dir + '/ffi/src/cpu/lib1.c', - test_dir + '/ffi/src/cpu/lib2.c', - ], - verbose=False, + name='test_extensions.cpulib', + header=test_dir + '/ffi/src/cpu/lib.h', + sources=[ + test_dir + '/ffi/src/cpu/lib1.c', + test_dir + '/ffi/src/cpu/lib2.c', + ], + verbose=False, ) from test_extensions import cpulib tensor = torch.ones(2, 2).float() @@ -217,20 +223,20 @@ class TestFFI(TestCase): self.assertIs(type(f), float) self.assertRaises(TypeError, - lambda: cpulib.good_func(tensor.double(), 2, 1.5)) + lambda: cpulib.good_func(tensor.double(), 2, 1.5)) self.assertRaises(torch.FatalError, - lambda: cpulib.bad_func(tensor, 2, 1.5)) + lambda: cpulib.bad_func(tensor, 2, 1.5)) @unittest.skipIf(not HAS_CFFI or not HAS_CUDA, "ffi tests require cffi package") def test_gpu(self): compile_extension( - name='gpulib', - header=test_dir + '/ffi/src/cuda/cudalib.h', - sources=[ - test_dir + '/ffi/src/cuda/cudalib.c', - ], - with_cuda=True, - verbose=False, + name='gpulib', + header=test_dir + '/ffi/src/cuda/cudalib.h', + sources=[ + test_dir + '/ffi/src/cuda/cudalib.c', + ], + with_cuda=True, + verbose=False, ) import gpulib tensor = torch.ones(2, 2).float() @@ -243,9 +249,9 @@ class TestFFI(TestCase): self.assertEqual(ctensor, torch.ones(2, 2) * 2 + 1.5) self.assertRaises(TypeError, - lambda: gpulib.cuda_func(tensor, 2, 1.5)) + lambda: gpulib.cuda_func(tensor, 2, 1.5)) self.assertRaises(TypeError, - lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) + lambda: gpulib.cuda_func(ctensor.storage(), 2, 1.5)) class TestLuaReader(TestCase): @@ -320,7 +326,7 @@ class TestLuaReader(TestCase): cls._download_data(test_file_path) except urllib.URLError as e: warnings.warn(("Couldn't download the test file for TestLuaReader! 
" - "Tests will be incomplete!"), RuntimeWarning) + "Tests will be incomplete!"), RuntimeWarning) return tests = load_lua(test_file_path) diff --git a/tools/cwrap/cwrap.py b/tools/cwrap/cwrap.py index 0f5f26a7fb..b763a0ed59 100644 --- a/tools/cwrap/cwrap.py +++ b/tools/cwrap/cwrap.py @@ -20,13 +20,14 @@ class cwrap(object): """) OPTION_CODE_TEMPLATE = [ - '$call', - '$return_result', + '$call', + '$return_result', ] FUNCTION_CALL_TEMPLATE = Template("$capture_result$cname($arg_unpack);") - DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, ArgumentReferences, BeforeAfterCall, ReturnArguments, GILRelease] + DEFAULT_PLUGIN_CLASSES = [ArgcountChecker, ConstantArguments, OptionalArguments, + ArgumentReferences, BeforeAfterCall, ReturnArguments, GILRelease] def __init__(self, source, destination=None, plugins=[], default_plugins=True): if destination is None: @@ -87,7 +88,7 @@ class cwrap(object): with open(fname, 'r') as f: included = f.read().split('\n') # insert it into lines at position i+1 - lines[i+1:i+1] = included + lines[i + 1:i + 1] = included else: output.append(line) i += 1 @@ -136,10 +137,10 @@ class cwrap(object): return fallback(*args) def get_type_check(self, arg, option): - return self.search_plugins('get_type_check', (arg, option), lambda arg,_: None) + return self.search_plugins('get_type_check', (arg, option), lambda arg, _: None) def get_type_unpack(self, arg, option): - return self.search_plugins('get_type_unpack', (arg, option), lambda arg,_: None) + return self.search_plugins('get_type_unpack', (arg, option), lambda arg, _: None) def get_return_wrapper(self, option): return self.search_plugins('get_return_wrapper', (option,), lambda _: self.RETURN_WRAPPERS[option['return']]) @@ -193,14 +194,14 @@ class cwrap(object): # Generate checks arg_checks = self.map_selected_arguments('get_type_check', - 'process_single_check', option, checked_args) + 'process_single_check', option, checked_args) arg_checks = ' &&\n '.join(arg_checks) for plugin in self.plugins: arg_checks = plugin.process_all_checks(arg_checks, option) # Generate unpacks arg_unpack = self.map_selected_arguments('get_type_unpack', - 'process_single_unpack', option, option['arguments']) + 'process_single_unpack', option, option['arguments']) arg_unpack = ', '.join(arg_unpack) for plugin in self.plugins: arg_unpack = plugin.process_all_unpacks(arg_unpack, option) @@ -209,16 +210,16 @@ class cwrap(object): try: return_result = self.get_return_wrapper(option).substitute() call = self.FUNCTION_CALL_TEMPLATE.substitute(capture_result='', - cname=option['cname'], arg_unpack=arg_unpack) + cname=option['cname'], arg_unpack=arg_unpack) except KeyError: return_result = self.get_return_wrapper(option).substitute(result='__result') call = self.FUNCTION_CALL_TEMPLATE.substitute(capture_result=(option['return'] + ' __result = '), - cname=option['cname'], arg_unpack=arg_unpack) + cname=option['cname'], arg_unpack=arg_unpack) code_template = deepcopy(self.OPTION_CODE_TEMPLATE) for plugin in self.plugins: code_template = plugin.process_option_code_template(code_template, - option) + option) code_template = Template('\n'.join(code_template)) code = code_template.substitute(call=call, return_result=return_result) code_lines = map(lambda s: s.strip(), code.split('\n')) diff --git a/tools/cwrap/plugins/ArgcountChecker.py b/tools/cwrap/plugins/ArgcountChecker.py index 2572327cc2..5852dc9f3f 100644 --- a/tools/cwrap/plugins/ArgcountChecker.py +++ b/tools/cwrap/plugins/ArgcountChecker.py @@ -1,5 +1,6 @@ from 
. import CWrapPlugin + class ArgcountChecker(CWrapPlugin): def process_all_checks(self, checks, option): diff --git a/tools/cwrap/plugins/ArgcountSortPlugin.py b/tools/cwrap/plugins/ArgcountSortPlugin.py index b77bd95666..d82b9a0b33 100644 --- a/tools/cwrap/plugins/ArgcountSortPlugin.py +++ b/tools/cwrap/plugins/ArgcountSortPlugin.py @@ -1,5 +1,6 @@ from . import CWrapPlugin + class ArgcountSortPlugin(CWrapPlugin): def __init__(self, descending=True): @@ -11,4 +12,3 @@ class ArgcountSortPlugin(CWrapPlugin): for declaration in declarations: declaration['options'].sort(key=num_checked_args, reverse=self.descending) return declarations - diff --git a/tools/cwrap/plugins/ArgumentReferences.py b/tools/cwrap/plugins/ArgumentReferences.py index fa1900e780..ab341b8dc0 100644 --- a/tools/cwrap/plugins/ArgumentReferences.py +++ b/tools/cwrap/plugins/ArgumentReferences.py @@ -1,6 +1,7 @@ from . import CWrapPlugin from string import Template + class ArgumentReferences(CWrapPlugin): def initialize(self, cwrap): diff --git a/tools/cwrap/plugins/AutoGPU.py b/tools/cwrap/plugins/AutoGPU.py index f3ff6a2c47..65ffc69cd2 100644 --- a/tools/cwrap/plugins/AutoGPU.py +++ b/tools/cwrap/plugins/AutoGPU.py @@ -1,5 +1,6 @@ from . import CWrapPlugin + class AutoGPU(CWrapPlugin): def __init__(self, has_self=True, condition=None): diff --git a/tools/cwrap/plugins/BeforeAfterCall.py b/tools/cwrap/plugins/BeforeAfterCall.py index e6b5584552..407b88d65e 100644 --- a/tools/cwrap/plugins/BeforeAfterCall.py +++ b/tools/cwrap/plugins/BeforeAfterCall.py @@ -1,6 +1,7 @@ from . import CWrapPlugin from string import Template + class BeforeAfterCall(CWrapPlugin): def initialize(self, cwrap): @@ -13,7 +14,7 @@ class BeforeAfterCall(CWrapPlugin): if '$' in prepend_str: before_call_template = Template(option[name]) args = {'arg' + str(i): self.cwrap.get_arg_accessor(arg, option) for i, arg - in enumerate(option['arguments'])} + in enumerate(option['arguments'])} prepend_str = before_call_template.substitute(args) template.insert(offset, prepend_str) @@ -23,5 +24,5 @@ class BeforeAfterCall(CWrapPlugin): self.insert_snippet(template, option, call_idx, 'before_call') # call position might have changed call_idx = template.index('$call') - self.insert_snippet(template, option, call_idx+1, 'after_call') + self.insert_snippet(template, option, call_idx + 1, 'after_call') return template diff --git a/tools/cwrap/plugins/BoolOption.py b/tools/cwrap/plugins/BoolOption.py index ba81e3b509..c686aa4ffd 100644 --- a/tools/cwrap/plugins/BoolOption.py +++ b/tools/cwrap/plugins/BoolOption.py @@ -1,6 +1,7 @@ from . import CWrapPlugin from string import Template + class BoolOption(CWrapPlugin): UNPACK_TEMPLATE = Template('$arg == Py_True ? $if_true : $if_false') @@ -16,4 +17,3 @@ class BoolOption(CWrapPlugin): if self.is_bool_option(arg): return Template(self.UNPACK_TEMPLATE.safe_substitute( if_true=arg['if_true'], if_false=arg['if_false'])) - diff --git a/tools/cwrap/plugins/ConstantArguments.py b/tools/cwrap/plugins/ConstantArguments.py index 4d30345e9e..7a67ebe621 100644 --- a/tools/cwrap/plugins/ConstantArguments.py +++ b/tools/cwrap/plugins/ConstantArguments.py @@ -1,6 +1,7 @@ from . 
import CWrapPlugin from string import Template + class ConstantArguments(CWrapPlugin): def process_declarations(self, declarations): @@ -18,5 +19,3 @@ class ConstantArguments(CWrapPlugin): def get_arg_accessor(self, arg, option): if arg['type'] == 'CONSTANT': return arg['name'] - - diff --git a/tools/cwrap/plugins/CuDNNPlugin.py b/tools/cwrap/plugins/CuDNNPlugin.py index 76432fbfce..6ee7395e0a 100644 --- a/tools/cwrap/plugins/CuDNNPlugin.py +++ b/tools/cwrap/plugins/CuDNNPlugin.py @@ -3,30 +3,31 @@ from copy import deepcopy from . import CWrapPlugin from itertools import product + class CuDNNPlugin(CWrapPlugin): TYPE_UNPACK = { - 'THTensor*': Template('((THPVoidTensor*)$arg)->cdata'), - 'int': Template('THPUtils_unpackLong($arg)'), + 'THTensor*': Template('((THPVoidTensor*)$arg)->cdata'), + 'int': Template('THPUtils_unpackLong($arg)'), 'std::vector<int>': Template('THPUtils_unpackIntTuple($arg)'), - 'cudnnDataType_t': Template('$arg'), - 'cudnnHandle_t': Template('$arg'), - 'Convolution*': Template('(Convolution*)THPWrapper_get($arg)'), - 'bool': Template('$arg == Py_True'), - 'double': Template('THPDoubleUtils_unpackReal($arg)'), + 'cudnnDataType_t': Template('$arg'), + 'cudnnHandle_t': Template('$arg'), + 'Convolution*': Template('(Convolution*)THPWrapper_get($arg)'), + 'bool': Template('$arg == Py_True'), + 'double': Template('THPDoubleUtils_unpackReal($arg)'), } TYPE_CHECK = { - 'Convolution*': Template('THPWrapper_check($arg)'), - 'THTensor*': Template('(PyObject*)Py_TYPE($arg) == tensorClass'), - 'int': Template('THPUtils_checkLong($arg)'), + 'Convolution*': Template('THPWrapper_check($arg)'), + 'THTensor*': Template('(PyObject*)Py_TYPE($arg) == tensorClass'), + 'int': Template('THPUtils_checkLong($arg)'), 'std::vector<int>': Template('THPUtils_checkIntTuple($arg)'), - 'bool': Template('PyBool_Check($arg)'), - 'double': Template('THPDoubleUtils_checkReal($arg)'), + 'bool': Template('PyBool_Check($arg)'), + 'double': Template('THPDoubleUtils_checkReal($arg)'), } RETURN_WRAPPER = { - 'Convolution*': Template('return THPWrapper_New($result, [](void* arg) { delete (Convolution*)arg; });'), + 'Convolution*': Template('return THPWrapper_New($result, [](void* arg) { delete (Convolution*)arg; });'), } METHODS_DECLARATION = Template(""" @@ -151,8 +152,8 @@ static PyObject * $name(PyObject *self, PyObject *args, PyObject *kwargs) if not declaration.get('only_register'): extra_flags += ' | METH_KEYWORDS' entry = Template(' {"$python_name", (PyCFunction)$name, METH_VARARGS$extra_flags, NULL},\n').substitute( - python_name=declaration['python_name'], name=declaration['name'], extra_flags=extra_flags - ) + python_name=declaration['python_name'], name=declaration['name'], extra_flags=extra_flags + ) if 'defined_if' in declaration: entry = self.preprocessor_guard(entry, declaration['defined_if']) methods += entry diff --git a/tools/cwrap/plugins/GILRelease.py b/tools/cwrap/plugins/GILRelease.py index 0be754847b..f6f435cc34 100644 --- a/tools/cwrap/plugins/GILRelease.py +++ b/tools/cwrap/plugins/GILRelease.py @@ -1,6 +1,7 @@ from . 
import CWrapPlugin from string import Template + class GILRelease(CWrapPlugin): OPTION_START = [ @@ -24,6 +25,5 @@ class GILRelease(CWrapPlugin): def process_option_code_template(self, template, option): call_idx = template.index('$call') template.insert(call_idx, self.BEFORE_CALL) - template.insert(call_idx+2, self.AFTER_CALL) + template.insert(call_idx + 2, self.AFTER_CALL) return self.OPTION_START + template + self.OPTION_END - diff --git a/tools/cwrap/plugins/KwargsPlugin.py b/tools/cwrap/plugins/KwargsPlugin.py index 85e7afabac..e4f598cc44 100644 --- a/tools/cwrap/plugins/KwargsPlugin.py +++ b/tools/cwrap/plugins/KwargsPlugin.py @@ -1,6 +1,7 @@ from . import CWrapPlugin from string import Template + class KwargsPlugin(CWrapPlugin): ACCESSOR_TEMPLATE = Template('(__tuplecount > $idx ? PyTuple_GET_ITEM(args, $idx) : __kw_$name)') @@ -53,7 +54,8 @@ class KwargsPlugin(CWrapPlugin): seen_args.add(name) args.append(name) declarations = '\n '.join(['PyObject *__kw_{} = NULL;'.format(name) for name in args]) - lookups = '\n '.join(['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=name) for name in args]) + lookups = '\n '.join( + ['__kw_{name} = PyDict_GetItemString(kwargs, "{name}");'.format(name=name) for name in args]) start_idx = code.find('{') + 1 new_code = self.WRAPPER_TEMPLATE.substitute(declarations=declarations, lookups=lookups) return code[:start_idx] + new_code + code[start_idx:] diff --git a/tools/cwrap/plugins/NullableArguments.py b/tools/cwrap/plugins/NullableArguments.py index 47c1b0c0ab..b69c96fa74 100644 --- a/tools/cwrap/plugins/NullableArguments.py +++ b/tools/cwrap/plugins/NullableArguments.py @@ -1,6 +1,8 @@ from . import CWrapPlugin + class NullableArguments(CWrapPlugin): + def process_single_check(self, code, arg, arg_accessor): if 'nullable' in arg and arg['nullable']: return '({} || {} == Py_None)'.format(code, arg_accessor) @@ -10,5 +12,3 @@ class NullableArguments(CWrapPlugin): if 'nullable' in arg and arg['nullable']: return '({} == Py_None ? NULL : {})'.format(arg_accessor, code) return code - - diff --git a/tools/cwrap/plugins/OptionalArguments.py b/tools/cwrap/plugins/OptionalArguments.py index 0bbc12d6d4..6cb24e1aee 100644 --- a/tools/cwrap/plugins/OptionalArguments.py +++ b/tools/cwrap/plugins/OptionalArguments.py @@ -2,6 +2,7 @@ from copy import deepcopy from . 
import CWrapPlugin from itertools import product + class OptionalArguments(CWrapPlugin): def process_declarations(self, declarations): @@ -32,20 +33,20 @@ class OptionalArguments(CWrapPlugin): else: kwarg_only_count = -kwarg_only_count arg_signature = '#'.join( - arg['type'] - for arg in option['arguments'][:kwarg_only_count] - if not arg.get('ignore_check')) + arg['type'] + for arg in option['arguments'][:kwarg_only_count] + if not arg.get('ignore_check')) if kwarg_only_count is None: return arg_signature kwarg_only_signature = '#'.join( - arg['name'] + '#' + arg['type'] - for arg in option['arguments'][kwarg_only_count:] - if not arg.get('ignore_check')) + arg['name'] + '#' + arg['type'] + for arg in option['arguments'][kwarg_only_count:] + if not arg.get('ignore_check')) return arg_signature + "#-#" + kwarg_only_signature seen_signatures = set() unique = [] for option in options: - for num_kwarg_only in range(0, len(option['arguments'])+1): + for num_kwarg_only in range(0, len(option['arguments']) + 1): sig = signature(option, num_kwarg_only) if sig not in seen_signatures: if num_kwarg_only > 0: @@ -55,4 +56,3 @@ class OptionalArguments(CWrapPlugin): seen_signatures.add(sig) break return unique - diff --git a/tools/cwrap/plugins/ReturnArguments.py b/tools/cwrap/plugins/ReturnArguments.py index 43a85a558f..651d617d5a 100644 --- a/tools/cwrap/plugins/ReturnArguments.py +++ b/tools/cwrap/plugins/ReturnArguments.py @@ -1,9 +1,10 @@ from . import CWrapPlugin from string import Template + class ReturnArguments(CWrapPlugin): - ARGUMENT_RETURN_TEMPLATE = Template("Py_INCREF($arg);\nreturn (PyObject*)($arg);") - TUPLE_RETURN_TEMPLATE = Template("return PyTuple_Pack($num_args, $args);") + ARGUMENT_RETURN_TEMPLATE = Template("Py_INCREF($arg);\nreturn (PyObject*)($arg);") + TUPLE_RETURN_TEMPLATE = Template("return PyTuple_Pack($num_args, $args);") def initialize(self, cwrap): self.cwrap = cwrap diff --git a/tools/cwrap/plugins/StandaloneExtension.py b/tools/cwrap/plugins/StandaloneExtension.py index 3442c80251..26407cf655 100644 --- a/tools/cwrap/plugins/StandaloneExtension.py +++ b/tools/cwrap/plugins/StandaloneExtension.py @@ -26,41 +26,41 @@ $METHODS class StandaloneExtension(CWrapPlugin): TYPE_UNPACK = { - 'THFloatTensor*': Template('THPFloatTensor_CData((THPFloatTensor*)$arg)'), - 'THDoubleTensor*': Template('THPDoubleTensor_CData((THPDoubleTensor*)$arg)'), - 'THLongTensor*': Template('THPLongTensor_CData((THPLongTensor*)$arg)'), - 'THIntTensor*': Template('THPIntTensor_CData((THPIntTensor*)$arg)'), + 'THFloatTensor*': Template('THPFloatTensor_CData((THPFloatTensor*)$arg)'), + 'THDoubleTensor*': Template('THPDoubleTensor_CData((THPDoubleTensor*)$arg)'), + 'THLongTensor*': Template('THPLongTensor_CData((THPLongTensor*)$arg)'), + 'THIntTensor*': Template('THPIntTensor_CData((THPIntTensor*)$arg)'), 'THCudaHalfTensor*': Template('THCPHalfTensor_CData((THCPHalfTensor*)$arg)'), - 'THCudaTensor*': Template('THCPFloatTensor_CData((THCPFloatTensor*)$arg)'), + 'THCudaTensor*': Template('THCPFloatTensor_CData((THCPFloatTensor*)$arg)'), 'THCudaDoubleTensor*': Template('THCPDoubleTensor_CData((THCPDoubleTensor*)$arg)'), 'THCudaLongTensor*': Template('THCPLongTensor_CData((THCPLongTensor*)$arg)'), - 'half': Template('THPHalfUtils_unpackReal($arg)'), - 'float': Template('THPFloatUtils_unpackReal($arg)'), - 'double': Template('THPDoubleUtils_unpackReal($arg)'), - 'bool': Template('($arg == Py_True ? 
true : false)'), - 'int': Template('THPUtils_unpackLong($arg)'), - 'long': Template('THPUtils_unpackLong($arg)'), - 'void*': Template('(void*)THPUtils_unpackLong($arg)'), - 'THGenerator*': Template('THPGenerator_CData((THPGenerator*)$arg)'), + 'half': Template('THPHalfUtils_unpackReal($arg)'), + 'float': Template('THPFloatUtils_unpackReal($arg)'), + 'double': Template('THPDoubleUtils_unpackReal($arg)'), + 'bool': Template('($arg == Py_True ? true : false)'), + 'int': Template('THPUtils_unpackLong($arg)'), + 'long': Template('THPUtils_unpackLong($arg)'), + 'void*': Template('(void*)THPUtils_unpackLong($arg)'), + 'THGenerator*': Template('THPGenerator_CData((THPGenerator*)$arg)'), } TYPE_CHECK = { - 'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'), - 'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'), - 'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'), - 'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'), + 'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'), + 'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'), + 'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'), + 'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'), 'THCudaHalfTensor*': Template('THCPHalfTensor_Check($arg)'), - 'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'), + 'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'), 'THCudaDoubleTensor*': Template('THCPDoubleTensor_Check($arg)'), 'THCudaLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPLongTensorClass'), - 'half': Template('THPHalfUtils_checkReal($arg)'), - 'float': Template('THPFloatUtils_checkReal($arg)'), - 'double': Template('THPDoubleUtils_checkReal($arg)'), - 'bool': Template('PyBool_Check($arg)'), - 'int': Template('THPUtils_checkLong($arg)'), - 'long': Template('THPUtils_checkLong($arg)'), - 'void*': Template('THPUtils_checkLong($arg)'), - 'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'), + 'half': Template('THPHalfUtils_checkReal($arg)'), + 'float': Template('THPFloatUtils_checkReal($arg)'), + 'double': Template('THPDoubleUtils_checkReal($arg)'), + 'bool': Template('PyBool_Check($arg)'), + 'int': Template('THPUtils_checkLong($arg)'), + 'long': Template('THPUtils_checkLong($arg)'), + 'void*': Template('THPUtils_checkLong($arg)'), + 'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'), } WRAPPER_TEMPLATE = Template(""" @@ -131,6 +131,7 @@ PyObject * $name(PyObject *_unused, PyObject *args) def get_wrapper_template(self, declaration): arg_desc = [] + def describe_arg(arg): desc = self.TYPE_NAMES[arg['type']] + ' ' + arg['name'] if arg.get('nullable'): @@ -138,8 +139,8 @@ PyObject * $name(PyObject *_unused, PyObject *args) return desc for option in declaration['options']: option_desc = [describe_arg(arg) - for arg in option['arguments'] - if not arg.get('ignore_check', False)] + for arg in option['arguments'] + if not arg.get('ignore_check', False)] if option_desc: arg_desc.append('({})'.format(', '.join(option_desc))) else: diff --git a/tools/cwrap/plugins/THPPlugin.py b/tools/cwrap/plugins/THPPlugin.py index caeab51360..1c4f5d2194 100644 --- a/tools/cwrap/plugins/THPPlugin.py +++ b/tools/cwrap/plugins/THPPlugin.py @@ -4,85 +4,86 @@ from . 
import CWrapPlugin from itertools import product, chain from collections import OrderedDict + class THPPlugin(CWrapPlugin): TYPE_UNPACK = { - 'THFloatTensor*': Template('((THPFloatTensor*)$arg)->cdata'), - 'THDoubleTensor*': Template('((THPDoubleTensor*)$arg)->cdata'), - 'THLongTensor*': Template('((THPLongTensor*)$arg)->cdata'), - 'THIntTensor*': Template('((THPIntTensor*)$arg)->cdata'), - 'THTensor*': Template('((THPTensor*)$arg)->cdata'), - 'THBoolTensor*': Template('((THPBoolTensor*)$arg)->cdata'), - 'THIndexTensor*': Template('((THPIndexTensor*)$arg)->cdata'), - - 'THSFloatTensor*': Template('((THSPFloatTensor*)$arg)->cdata'), + 'THFloatTensor*': Template('((THPFloatTensor*)$arg)->cdata'), + 'THDoubleTensor*': Template('((THPDoubleTensor*)$arg)->cdata'), + 'THLongTensor*': Template('((THPLongTensor*)$arg)->cdata'), + 'THIntTensor*': Template('((THPIntTensor*)$arg)->cdata'), + 'THTensor*': Template('((THPTensor*)$arg)->cdata'), + 'THBoolTensor*': Template('((THPBoolTensor*)$arg)->cdata'), + 'THIndexTensor*': Template('((THPIndexTensor*)$arg)->cdata'), + + 'THSFloatTensor*': Template('((THSPFloatTensor*)$arg)->cdata'), 'THSDoubleTensor*': Template('((THSPDoubleTensor*)$arg)->cdata'), - 'THSLongTensor*': Template('((THSPLongTensor*)$arg)->cdata'), - 'THSIntTensor*': Template('((THSPIntTensor*)$arg)->cdata'), - 'THSTensor*': Template('((THSPTensor*)$arg)->cdata'), - 'THSBoolTensor*': Template('((THSPBoolTensor*)$arg)->cdata'), - 'THSIndexTensor*': Template('((THSPIndexTensor*)$arg)->cdata'), - - 'THLongStorage*': Template('((THPLongStorage*)$arg)->cdata'), - 'THStorage*': Template('((THPStorage*)$arg)->cdata'), - 'THGenerator*': Template('((THPGenerator*)$arg)->cdata'), - 'THSize*': Template('__size.get()'), - 'THStride*': Template('__stride.get()'), - 'void*': Template('THPUtils_unpackLong($arg)'), - 'long': Template('THPUtils_unpackLong($arg)'), - 'int': Template('THPUtils_unpackLong($arg)'), - 'bool': Template('($arg == Py_True ? true : false)'), - 'float': Template('THPFloatUtils_unpackReal($arg)'), - 'double': Template('THPDoubleUtils_unpackReal($arg)'), - 'real': Template('THPUtils_(unpackReal)($arg)'), - 'accreal': Template('THPUtils_(unpackAccreal)($arg)'), + 'THSLongTensor*': Template('((THSPLongTensor*)$arg)->cdata'), + 'THSIntTensor*': Template('((THSPIntTensor*)$arg)->cdata'), + 'THSTensor*': Template('((THSPTensor*)$arg)->cdata'), + 'THSBoolTensor*': Template('((THSPBoolTensor*)$arg)->cdata'), + 'THSIndexTensor*': Template('((THSPIndexTensor*)$arg)->cdata'), + + 'THLongStorage*': Template('((THPLongStorage*)$arg)->cdata'), + 'THStorage*': Template('((THPStorage*)$arg)->cdata'), + 'THGenerator*': Template('((THPGenerator*)$arg)->cdata'), + 'THSize*': Template('__size.get()'), + 'THStride*': Template('__stride.get()'), + 'void*': Template('THPUtils_unpackLong($arg)'), + 'long': Template('THPUtils_unpackLong($arg)'), + 'int': Template('THPUtils_unpackLong($arg)'), + 'bool': Template('($arg == Py_True ? 
true : false)'), + 'float': Template('THPFloatUtils_unpackReal($arg)'), + 'double': Template('THPDoubleUtils_unpackReal($arg)'), + 'real': Template('THPUtils_(unpackReal)($arg)'), + 'accreal': Template('THPUtils_(unpackAccreal)($arg)'), } TYPE_CHECK = { - 'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'), - 'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'), - 'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'), - 'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'), - 'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'), - 'THTensor*': Template('(PyObject*)Py_TYPE($arg) == THPTensorClass'), - 'THBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THPBoolTensorClass'), - 'THIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIndexTensorClass'), + 'THDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THPDoubleTensorClass'), + 'THFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THPFloatTensorClass'), + 'THLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THPLongTensorClass'), + 'THIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIntTensorClass'), + 'THCudaTensor*': Template('(PyObject*)Py_TYPE($arg) == THCPFloatTensorClass'), + 'THTensor*': Template('(PyObject*)Py_TYPE($arg) == THPTensorClass'), + 'THBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THPBoolTensorClass'), + 'THIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THPIndexTensorClass'), 'THSDoubleTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPDoubleTensorClass'), - 'THSFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPFloatTensorClass'), - 'THSLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPLongTensorClass'), - 'THSIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIntTensorClass'), - 'THSTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPTensorClass'), - 'THSBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPBoolTensorClass'), - 'THSIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIndexTensorClass'), - - 'THLongStorage*': Template('(PyObject*)Py_TYPE($arg) == THPLongStorageClass'), - 'THStorage*': Template('(PyObject*)Py_TYPE($arg) == THPStorageClass'), - 'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == THPGeneratorClass'), - 'THSize*': Template('THPUtils_tryUnpackLongs($arg, __size)'), - 'THStride*': Template('THPUtils_tryUnpackLongs($arg, __stride)'), - 'void*': Template('THPUtils_checkLong($arg)'), - 'long': Template('THPUtils_checkLong($arg)'), - 'int': Template('THPUtils_checkLong($arg)'), - 'bool': Template('PyBool_Check($arg)'), - 'float': Template('THPFloatUtils_checkReal($arg)'), - 'double': Template('THPDoubleUtils_checkReal($arg)'), - 'real': Template('THPUtils_(checkReal)($arg)'), - 'accreal': Template('THPUtils_(checkAccreal)($arg)'), + 'THSFloatTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPFloatTensorClass'), + 'THSLongTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPLongTensorClass'), + 'THSIntTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIntTensorClass'), + 'THSTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPTensorClass'), + 'THSBoolTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPBoolTensorClass'), + 'THSIndexTensor*': Template('(PyObject*)Py_TYPE($arg) == THSPIndexTensorClass'), + + 'THLongStorage*': Template('(PyObject*)Py_TYPE($arg) == THPLongStorageClass'), + 'THStorage*': Template('(PyObject*)Py_TYPE($arg) == THPStorageClass'), + 'THGenerator*': Template('(PyObject*)Py_TYPE($arg) == 
THPGeneratorClass'), + 'THSize*': Template('THPUtils_tryUnpackLongs($arg, __size)'), + 'THStride*': Template('THPUtils_tryUnpackLongs($arg, __stride)'), + 'void*': Template('THPUtils_checkLong($arg)'), + 'long': Template('THPUtils_checkLong($arg)'), + 'int': Template('THPUtils_checkLong($arg)'), + 'bool': Template('PyBool_Check($arg)'), + 'float': Template('THPFloatUtils_checkReal($arg)'), + 'double': Template('THPDoubleUtils_checkReal($arg)'), + 'real': Template('THPUtils_(checkReal)($arg)'), + 'accreal': Template('THPUtils_(checkAccreal)($arg)'), } SIZE_VARARG_CHECK = Template('THPUtils_tryUnpackLongVarArgs(args, $idx, __size)') RETURN_WRAPPER = { - 'THTensor*': Template('return THPTensor_(New)($result);'), - 'THSTensor*': Template('return THSPTensor_(New)($result);'), - 'THLongTensor*': Template('return THPLongTensor_New($result);'), - 'THLongStorage*': Template('return THPLongStorage_New($result);'), + 'THTensor*': Template('return THPTensor_(New)($result);'), + 'THSTensor*': Template('return THSPTensor_(New)($result);'), + 'THLongTensor*': Template('return THPLongTensor_New($result);'), + 'THLongStorage*': Template('return THPLongStorage_New($result);'), # TODO: make it smarter - it should return python long if result doesn't fit into an int - 'long': Template('return PyInt_FromLong($result);'), - 'accreal': Template('return THPUtils_(newAccreal)($result);'), - 'self': Template('Py_INCREF(self);\nreturn (PyObject*)self;'), - 'real': Template('return THPUtils_(newReal)($result);'), + 'long': Template('return PyInt_FromLong($result);'), + 'accreal': Template('return THPUtils_(newAccreal)($result);'), + 'self': Template('Py_INCREF(self);\nreturn (PyObject*)self;'), + 'real': Template('return THPUtils_(newReal)($result);'), } TENSOR_METHODS_DECLARATION = Template(""" @@ -138,13 +139,13 @@ ${cpu} return Template(code) ALLOCATE_TYPE = { - 'THTensor*': _allocate('', ALLOCATE_TMPL), - 'THLongTensor*': _allocate('Long', ALLOCATE_TMPL), - 'THIntTensor*': _allocate('Int', ALLOCATE_TMPL), - 'THBoolTensor*': _allocate('Byte', ALLOCATE_TMPL, ALLOCATE_CUDA), - 'THIndexTensor*': _allocate('Long', ALLOCATE_TMPL, ALLOCATE_CUDA), + 'THTensor*': _allocate('', ALLOCATE_TMPL), + 'THLongTensor*': _allocate('Long', ALLOCATE_TMPL), + 'THIntTensor*': _allocate('Int', ALLOCATE_TMPL), + 'THBoolTensor*': _allocate('Byte', ALLOCATE_TMPL, ALLOCATE_CUDA), + 'THIndexTensor*': _allocate('Long', ALLOCATE_TMPL, ALLOCATE_CUDA), - 'THSTensor*': _allocate('', ALLOCATE_TMPL, sparse=True), + 'THSTensor*': _allocate('', ALLOCATE_TMPL, sparse=True), } TYPE_NAMES = { @@ -205,7 +206,7 @@ ${cpu} if len(output_args) > 1: out_type = 'tuple[' out_type += ', '.join( - self.TYPE_NAMES[arg['type']] for arg in output_args) + self.TYPE_NAMES[arg['type']] for arg in output_args) out_type += ']' option_desc += ['#' + out_type + ' out'] else: @@ -287,7 +288,7 @@ ${cpu} if not output_provided: arg['ignore_check'] = True else: - option_copy['argcount_offset'] = -len(out_idx) + 1 + option_copy['argcount_offset'] = -len(out_idx) + 1 arg['no_kwargs'] = True arg['no_idx'] = True new_options.append(option_copy) @@ -345,7 +346,6 @@ ${cpu} if arg['name'] == 'self': arg['ignore_check'] = True - declarations = [d for d in declarations if not d.get('only_stateless', False)] self.declarations.extend(filter(lambda x: not x.get('only_stateless', False), register_only)) self.stateless_declarations.extend(filter(lambda x: x.get('only_stateless', False), register_only)) @@ -377,9 +377,9 @@ ${cpu} if declaration.get('override_method_flags'): flags = 
declaration['override_method_flags'] entry = Template(' {"$python_name", (PyCFunction)$name, $flags, $docstring},\n').substitute( - python_name=declaration['python_name'], name=declaration['name'], flags=flags, - docstring=declaration.get('docstring_var', 'NULL') - ) + python_name=declaration['python_name'], name=declaration['name'], flags=flags, + docstring=declaration.get('docstring_var', 'NULL') + ) if 'defined_if' in declaration: entry = self.preprocessor_guard(entry, declaration['defined_if']) tensor_methods += entry @@ -401,7 +401,7 @@ ${cpu} ) def preprocessor_guard(self, code, condition): - return '#if ' + condition + '\n' + code + '#endif\n' + return '#if ' + condition + '\n' + code + '#endif\n' def process_wrapper(self, code, declaration): if 'defined_if' in declaration: @@ -419,7 +419,7 @@ ${cpu} if option['output_count'] > 1: checks += "PyTuple_Check(__out) &&\n" + indent length_check = "PyTuple_GET_SIZE(__out) == {} &&\n".format( - option['output_count']) + option['output_count']) checks += length_check + indent code = checks + code else: @@ -443,13 +443,13 @@ ${cpu} def generate_docstrings_cpp(self): template = Template('char* $name = "$content";') return '\n\n'.join( - template.substitute(name=decl['docstring_var'], content=decl['docstring_content']) - for decl in chain(self.declarations, self.stateless_declarations) - if 'docstring_var' in decl) + template.substitute(name=decl['docstring_var'], content=decl['docstring_content']) + for decl in chain(self.declarations, self.stateless_declarations) + if 'docstring_var' in decl) def generate_docstrings_h(self): template = Template('extern char* $name;') return '\n\n'.join( - template.substitute(name=decl['docstring_var']) - for decl in chain(self.declarations, self.stateless_declarations) - if 'docstring_var' in decl) + template.substitute(name=decl['docstring_var']) + for decl in chain(self.declarations, self.stateless_declarations) + if 'docstring_var' in decl) diff --git a/tools/nnwrap/generate_wrappers.py b/tools/nnwrap/generate_wrappers.py index 9520cfcac7..d5ca74d9e0 100644 --- a/tools/nnwrap/generate_wrappers.py +++ b/tools/nnwrap/generate_wrappers.py @@ -8,6 +8,7 @@ BASE_PATH = os.path.realpath(os.path.join(__file__, '..', '..', '..')) WRAPPER_PATH = os.path.join(BASE_PATH, 'torch', 'csrc', 'nn') THNN_UTILS_PATH = os.path.join(BASE_PATH, 'torch', '_thnn', 'utils.py') + def import_module(name, path): if sys.version_info >= (3, 5): import importlib.util @@ -81,7 +82,8 @@ for t in ['CudaHalf', 'Cuda', 'CudaDouble']: def wrap_function(name, type, arguments): cname = 'THNN_' + type + name declaration = '' - declaration += 'extern "C" void ' + cname + '(' + ', '.join(TYPE_TRANSFORMS[type].get(arg.type, arg.type) for arg in arguments) + ');\n' + declaration += 'extern "C" void ' + cname + \ + '(' + ', '.join(TYPE_TRANSFORMS[type].get(arg.type, arg.type) for arg in arguments) + ');\n' declaration += FUNCTION_TEMPLATE.substitute(name=type + name, cname=cname) indent = ' ' * 4 dict_indent = ' ' * 6 @@ -92,15 +94,17 @@ def wrap_function(name, type, arguments): else: t = TYPE_TRANSFORMS[type].get(arg.type, arg.type) declaration += prefix + 'type: ' + t + '\n' + \ - dict_indent + 'name: ' + arg.name + '\n' + \ - dict_indent + 'nullable: True' + '\n' + dict_indent + 'name: ' + arg.name + '\n' + \ + dict_indent + 'nullable: True' + '\n' declaration += ']]\n\n\n' return declaration + def generate_wrappers(): wrap_nn() wrap_cunn() + def wrap_nn(): wrapper = '#include <TH/TH.h>\n\n\n' nn_functions = 
thnn_utils.parse_header(thnn_utils.THNN_H_PATH) @@ -114,6 +118,7 @@ def wrap_nn(): NullableArguments(), ]) + def wrap_cunn(): wrapper = '#include <TH/TH.h>\n' wrapper += '#include <THC/THC.h>\n\n\n' diff --git a/tools/setup_helpers/env.py b/tools/setup_helpers/env.py index 70db036899..b7a555b651 100644 --- a/tools/setup_helpers/env.py +++ b/tools/setup_helpers/env.py @@ -1,4 +1,5 @@ import os + def check_env_flag(name): return os.getenv(name) in ['ON', '1', 'YES', 'TRUE', 'Y'] diff --git a/torch/__init__.py b/torch/__init__.py index b75a8c0478..021fee57a7 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -56,6 +56,7 @@ del old_flags # Define basic utilities ################################################################################ + def typename(o): module = '' class_name = '' @@ -91,7 +92,7 @@ def set_default_tensor_type(t): def set_rng_state(new_state): r"""Sets the random number generator state. - + Args: new_state (torch.ByteTensor): The desired state """ @@ -106,7 +107,7 @@ def get_rng_state(): def manual_seed(seed): r"""Sets the seed for generating random numbers. And returns a `torch._C.Generator` object. - + Args: seed (int or long): The desired seed. """ @@ -130,61 +131,101 @@ from ._tensor_str import set_printoptions from .storage import _StorageBase from .tensor import _TensorBase + class DoubleStorage(_C.DoubleStorageBase, _StorageBase): pass + + class FloatStorage(_C.FloatStorageBase, _StorageBase): pass + + class LongStorage(_C.LongStorageBase, _StorageBase): pass + + class IntStorage(_C.IntStorageBase, _StorageBase): pass + + class ShortStorage(_C.ShortStorageBase, _StorageBase): pass + + class CharStorage(_C.CharStorageBase, _StorageBase): pass + + class ByteStorage(_C.ByteStorageBase, _StorageBase): pass + class DoubleTensor(_C.DoubleTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return DoubleStorage + + class FloatTensor(_C.FloatTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return FloatStorage + + class LongTensor(_C.LongTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return LongStorage + + class IntTensor(_C.IntTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return IntStorage + + class ShortTensor(_C.ShortTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return ShortStorage + + class CharTensor(_C.CharTensorBase, _TensorBase): + def is_signed(self): # TODO return False + @classmethod def storage_type(cls): return CharStorage + + class ByteTensor(_C.ByteTensorBase, _TensorBase): + def is_signed(self): return False + @classmethod def storage_type(cls): return ByteStorage diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index a4e6cfc808..caae3905ef 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -4,133 +4,133 @@ import torch._C from torch._C import _add_docstr as add_docstr add_docstr(torch._C.FloatTensorBase.abs, -""" + """ abs() -> Tensor See :func:`torch.abs` """) add_docstr(torch._C.FloatTensorBase.abs_, -""" + """ abs_() -> Tensor In-place version of :meth:`~Tensor.abs` """) add_docstr(torch._C.FloatTensorBase.acos, -""" + """ acos() -> Tensor See :func:`torch.acos` """) add_docstr(torch._C.FloatTensorBase.acos_, -""" + """ acos_() -> Tensor In-place version of :meth:`~Tensor.acos` """) add_docstr(torch._C.FloatTensorBase.add, -""" + """ add(value) See :func:`torch.add` """) 
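For context, the hunks in torch/_tensor_docs.py above and below only re-indent the string literal passed as the second argument to `add_docstr`; the calls themselves are unchanged. A minimal sketch of that pattern (illustrative only; the real registrations run once when torch/_tensor_docs.py is imported):

    import torch._C
    from torch._C import _add_docstr as add_docstr

    # Attach a docstring to a method implemented in the C extension. The
    # multi-line string is an ordinary continuation argument, which is why
    # these hunks adjust its indentation rather than its contents.
    add_docstr(torch._C.FloatTensorBase.abs,
               """
    abs() -> Tensor

    See :func:`torch.abs`
    """)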
add_docstr(torch._C.FloatTensorBase.add_, -""" + """ add_(value) In-place version of :meth:`~Tensor.add` """) add_docstr(torch._C.FloatTensorBase.addbmm, -""" + """ addbmm(beta=1, mat, alpha=1, batch1, batch2) -> Tensor See :func:`torch.addbmm` """) add_docstr(torch._C.FloatTensorBase.addbmm_, -""" + """ addbmm_(beta=1, mat, alpha=1, batch1, batch2) -> Tensor In-place version of :meth:`~Tensor.addbmm` """) add_docstr(torch._C.FloatTensorBase.addcdiv, -""" + """ addcdiv(value=1, tensor1, tensor2) -> Tensor See :func:`torch.addcdiv` """) add_docstr(torch._C.FloatTensorBase.addcdiv_, -""" + """ addcdiv_(value=1, tensor1, tensor2) -> Tensor In-place version of :meth:`~Tensor.addcdiv` """) add_docstr(torch._C.FloatTensorBase.addcmul, -""" + """ addcmul(value=1, tensor1, tensor2) -> Tensor See :func:`torch.addcmul` """) add_docstr(torch._C.FloatTensorBase.addcmul_, -""" + """ addcmul_(value=1, tensor1, tensor2) -> Tensor In-place version of :meth:`~Tensor.addcmul` """) add_docstr(torch._C.FloatTensorBase.addmm, -""" + """ addmm(beta=1, mat, alpha=1, mat1, mat2) -> Tensor See :func:`torch.addmm` """) add_docstr(torch._C.FloatTensorBase.addmm_, -""" + """ addmm_(beta=1, mat, alpha=1, mat1, mat2) -> Tensor In-place version of :meth:`~Tensor.addmm` """) add_docstr(torch._C.FloatTensorBase.addmv, -""" + """ addmv(beta=1, tensor, alpha=1, mat, vec) -> Tensor See :func:`torch.addmv` """) add_docstr(torch._C.FloatTensorBase.addmv_, -""" + """ addmv_(beta=1, tensor, alpha=1, mat, vec) -> Tensor In-place version of :meth:`~Tensor.addmv` """) add_docstr(torch._C.FloatTensorBase.addr, -""" + """ addr(beta=1, alpha=1, vec1, vec2) -> Tensor See :func:`torch.addr` """) add_docstr(torch._C.FloatTensorBase.addr_, -""" + """ addr_(beta=1, alpha=1, vec1, vec2) -> Tensor In-place version of :meth:`~Tensor.addr` """) add_docstr(torch._C.FloatTensorBase.apply_, -""" + """ apply_(callable) -> Tensor Applies the function :attr:`callable` to each element in the tensor, replacing @@ -143,84 +143,84 @@ each element with the value returned by :attr:`callable`. 
""") add_docstr(torch._C.FloatTensorBase.asin, -""" + """ asin() -> Tensor See :func:`torch.asin` """) add_docstr(torch._C.FloatTensorBase.asin_, -""" + """ asin_() -> Tensor In-place version of :meth:`~Tensor.asin` """) add_docstr(torch._C.FloatTensorBase.atan, -""" + """ atan() -> Tensor See :func:`torch.atan` """) add_docstr(torch._C.FloatTensorBase.atan2, -""" + """ atan2(other) -> Tensor See :func:`torch.atan2` """) add_docstr(torch._C.FloatTensorBase.atan2_, -""" + """ atan2_(other) -> Tensor In-place version of :meth:`~Tensor.atan2` """) add_docstr(torch._C.FloatTensorBase.atan_, -""" + """ atan_() -> Tensor In-place version of :meth:`~Tensor.atan` """) add_docstr(torch._C.FloatTensorBase.baddbmm, -""" + """ baddbmm(beta=1, alpha=1, batch1, batch2) -> Tensor See :func:`torch.baddbmm` """) add_docstr(torch._C.FloatTensorBase.baddbmm_, -""" + """ baddbmm_(beta=1, alpha=1, batch1, batch2) -> Tensor In-place version of :meth:`~Tensor.baddbmm` """) add_docstr(torch._C.FloatTensorBase.bernoulli, -""" + """ bernoulli() -> Tensor See :func:`torch.bernoulli` """) add_docstr(torch._C.FloatTensorBase.bernoulli_, -""" + """ bernoulli_() -> Tensor In-place version of :meth:`~Tensor.bernoulli` """) add_docstr(torch._C.FloatTensorBase.bmm, -""" + """ bmm(batch2) -> Tensor See :func:`torch.bmm` """) add_docstr(torch._C.FloatTensorBase.cauchy_, -""" + """ cauchy_(generator=None, median=0, sigma=1) -> Tensor Fills the tensor with numbers drawn from the Cauchy distribution: @@ -231,35 +231,35 @@ Fills the tensor with numbers drawn from the Cauchy distribution: """) add_docstr(torch._C.FloatTensorBase.ceil, -""" + """ ceil() -> Tensor See :func:`torch.ceil` """) add_docstr(torch._C.FloatTensorBase.ceil_, -""" + """ ceil_() -> Tensor In-place version of :meth:`~Tensor.ceil` """) add_docstr(torch._C.FloatTensorBase.clamp, -""" + """ clamp(min, max) -> Tensor See :func:`torch.clamp` """) add_docstr(torch._C.FloatTensorBase.clamp_, -""" + """ clamp_(min, max) -> Tensor In-place version of :meth:`~Tensor.clamp` """) add_docstr(torch._C.FloatTensorBase.clone, -""" + """ clone() -> Tensor Returns a copy of the tensor. The copy has the same size and data type as the @@ -267,7 +267,7 @@ original tensor. """) add_docstr(torch._C.FloatTensorBase.contiguous, -""" + """ contiguous() -> Tensor Returns a contiguous Tensor containing the same data as this tensor. If this @@ -275,7 +275,7 @@ tensor is contiguous, this function returns the original tensor. """) add_docstr(torch._C.FloatTensorBase.copy_, -""" + """ copy_(src, async=False) -> Tensor Copies the elements from :attr:`src` into this tensor and returns this tensor. 
@@ -291,112 +291,112 @@ Args: """) add_docstr(torch._C.FloatTensorBase.cos, -""" + """ cos() -> Tensor See :func:`torch.cos` """) add_docstr(torch._C.FloatTensorBase.cos_, -""" + """ cos_() -> Tensor In-place version of :meth:`~Tensor.cos` """) add_docstr(torch._C.FloatTensorBase.cosh, -""" + """ cosh() -> Tensor See :func:`torch.cosh` """) add_docstr(torch._C.FloatTensorBase.cosh_, -""" + """ cosh_() -> Tensor In-place version of :meth:`~Tensor.cosh` """) add_docstr(torch._C.FloatTensorBase.cross, -""" + """ cross(other, dim=-1) -> Tensor See :func:`torch.cross` """) add_docstr(torch._C.FloatTensorBase.cumprod, -""" + """ cumprod(dim) -> Tensor See :func:`torch.cumprod` """) add_docstr(torch._C.FloatTensorBase.cumsum, -""" + """ cumsum(dim) -> Tensor See :func:`torch.cumsum` """) add_docstr(torch._C.FloatTensorBase.data_ptr, -""" + """ data_ptr() -> int Returns the address of the first element of this tensor. """) add_docstr(torch._C.FloatTensorBase.diag, -""" + """ diag(diagonal=0) -> Tensor See :func:`torch.diag` """) add_docstr(torch._C.FloatTensorBase.dim, -""" + """ dim() -> int Returns the number of dimensions of this tensor. """) add_docstr(torch._C.FloatTensorBase.dist, -""" + """ dist(other, p=2) -> Tensor See :func:`torch.dist` """) add_docstr(torch._C.FloatTensorBase.div, -""" + """ div(value) See :func:`torch.div` """) add_docstr(torch._C.FloatTensorBase.div_, -""" + """ div_(value) In-place version of :meth:`~Tensor.div` """) add_docstr(torch._C.FloatTensorBase.dot, -""" + """ dot(tensor2) -> float See :func:`torch.dot` """) add_docstr(torch._C.FloatTensorBase.eig, -""" + """ eig(eigenvectors=False) -> (Tensor, Tensor) See :func:`torch.eig` """) add_docstr(torch._C.FloatTensorBase.element_size, -""" + """ element_size() -> int Returns the size in bytes of an individual element. @@ -409,42 +409,42 @@ Example: """) add_docstr(torch._C.FloatTensorBase.eq, -""" + """ eq(other) -> Tensor See :func:`torch.eq` """) add_docstr(torch._C.FloatTensorBase.eq_, -""" + """ eq_(other) -> Tensor In-place version of :meth:`~Tensor.eq` """) add_docstr(torch._C.FloatTensorBase.equal, -""" + """ equal(other) -> bool See :func:`torch.equal` """) add_docstr(torch._C.FloatTensorBase.exp, -""" + """ exp() -> Tensor See :func:`torch.exp` """) add_docstr(torch._C.FloatTensorBase.exp_, -""" + """ exp_() -> Tensor In-place version of :meth:`~Tensor.exp` """) add_docstr(torch._C.FloatTensorBase.exponential_, -""" + """ exponential_(generator=None, lambd=1) -> Tensor Fills this tensor with elements drawn from the exponential distribution: @@ -455,84 +455,84 @@ Fills this tensor with elements drawn from the exponential distribution: """) add_docstr(torch._C.FloatTensorBase.fill_, -""" + """ fill_(value) -> Tensor Fills this tensor with the specified value. 
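As a quick usage sketch of the in-place `fill_` documented just above (throughout these docstrings a trailing underscore marks an in-place operation that also returns the tensor itself):

    import torch

    t = torch.Tensor(2, 3)   # uninitialized 2x3 FloatTensor
    t.fill_(1.5)             # every element becomes 1.5, in place
    print(t)                 # fill_ also returns t, so calls can be chained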
""") add_docstr(torch._C.FloatTensorBase.floor, -""" + """ floor() -> Tensor See :func:`torch.floor` """) add_docstr(torch._C.FloatTensorBase.floor_, -""" + """ floor_() -> Tensor In-place version of :meth:`~Tensor.floor` """) add_docstr(torch._C.FloatTensorBase.fmod, -""" + """ fmod(divisor) -> Tensor See :func:`torch.fmod` """) add_docstr(torch._C.FloatTensorBase.fmod_, -""" + """ fmod_(divisor) -> Tensor In-place version of :meth:`~Tensor.fmod` """) add_docstr(torch._C.FloatTensorBase.frac, -""" + """ frac() -> Tensor See :func:`torch.frac` """) add_docstr(torch._C.FloatTensorBase.frac_, -""" + """ frac_() -> Tensor In-place version of :meth:`~Tensor.frac` """) add_docstr(torch._C.FloatTensorBase.gather, -""" + """ gather(dim, index) -> Tensor See :func:`torch.gather` """) add_docstr(torch._C.FloatTensorBase.ge, -""" + """ ge(other) -> Tensor See :func:`torch.ge` """) add_docstr(torch._C.FloatTensorBase.ge_, -""" + """ ge_(other) -> Tensor In-place version of :meth:`~Tensor.ge` """) add_docstr(torch._C.FloatTensorBase.gels, -""" + """ gels(A) -> Tensor See :func:`torch.gels` """) add_docstr(torch._C.FloatTensorBase.geometric_, -""" + """ geometric_(generator=None, p) -> Tensor Fills this tensor with elements drawn from the geometric distribution: @@ -544,49 +544,49 @@ Fills this tensor with elements drawn from the geometric distribution: """) add_docstr(torch._C.FloatTensorBase.geqrf, -""" + """ geqrf() -> (Tensor, Tensor) See :func:`torch.geqrf` """) add_docstr(torch._C.FloatTensorBase.ger, -""" + """ ger(vec2) -> Tensor See :func:`torch.ger` """) add_docstr(torch._C.FloatTensorBase.gesv, -""" + """ gesv(A) -> Tensor, Tensor See :func:`torch.gesv` """) add_docstr(torch._C.FloatTensorBase.gt, -""" + """ gt(other) -> Tensor See :func:`torch.gt` """) add_docstr(torch._C.FloatTensorBase.gt_, -""" + """ gt_(other) -> Tensor In-place version of :meth:`~Tensor.gt` """) add_docstr(torch._C.FloatTensorBase.histc, -""" + """ histc(bins=100, min=0, max=0) -> Tensor See :func:`torch.histc` """) add_docstr(torch._C.FloatTensorBase.index, -""" + """ index(m) -> Tensor Selects elements from this tensor using a binary mask or along a given @@ -597,7 +597,7 @@ Args: """) add_docstr(torch._C.FloatTensorBase.index_add_, -""" + """ index_add_(dim, index, tensor) -> Tensor Accumulate the elements of tensor into the original tensor by adding to the @@ -622,7 +622,7 @@ Example: """) add_docstr(torch._C.FloatTensorBase.index_copy_, -""" + """ index_copy_(dim, index, tensor) -> Tensor Copies the elements of tensor into the original tensor by selecting the @@ -647,7 +647,7 @@ Example: """) add_docstr(torch._C.FloatTensorBase.index_fill_, -""" + """ index_fill_(dim, index, tensor) -> Tensor Fills the elements of the original tensor with value :attr:`val` by selecting @@ -670,28 +670,28 @@ Example: """) add_docstr(torch._C.FloatTensorBase.index_select, -""" + """ index_select(dim, index) -> Tensor See :func:`torch.index_select` """) add_docstr(torch._C.FloatTensorBase.inverse, -""" + """ inverse() -> Tensor See :func:`torch.inverse` """) add_docstr(torch._C.FloatTensorBase.is_contiguous, -""" + """ is_contiguous() -> bool Returns True if this tensor is contiguous in memory in C order. """) add_docstr(torch._C.FloatTensorBase.is_set_to, -""" + """ is_set_to(tensor) -> bool Returns True if this object refers to the same ``THTensor`` object from the @@ -699,56 +699,56 @@ Torch C API as the given tensor. 
""") add_docstr(torch._C.FloatTensorBase.kthvalue, -""" + """ kthvalue(k, dim=None) -> (Tensor, LongTensor) See :func:`torch.kthvalue` """) add_docstr(torch._C.FloatTensorBase.le, -""" + """ le(other) -> Tensor See :func:`torch.le` """) add_docstr(torch._C.FloatTensorBase.le_, -""" + """ le_(other) -> Tensor In-place version of :meth:`~Tensor.le` """) add_docstr(torch._C.FloatTensorBase.lerp, -""" + """ lerp(start, end, weight) See :func:`torch.lerp` """) add_docstr(torch._C.FloatTensorBase.lerp_, -""" + """ lerp_(start, end, weight) In-place version of :meth:`~Tensor.lerp` """) add_docstr(torch._C.FloatTensorBase.log, -""" + """ log() -> Tensor See :func:`torch.log` """) add_docstr(torch._C.FloatTensorBase.log1p, -""" + """ log1p() -> Tensor See :func:`torch.log1p` """) add_docstr(torch._C.FloatTensorBase.log1p_, -""" + """ log1p_() -> Tensor In-place version of :meth:`~Tensor.log1p` @@ -774,21 +774,21 @@ underlying normal distribution, and not of the returned distribution: """) add_docstr(torch._C.FloatTensorBase.lt, -""" + """ lt(other) -> Tensor See :func:`torch.lt` """) add_docstr(torch._C.FloatTensorBase.lt_, -""" + """ lt_(other) -> Tensor In-place version of :meth:`~Tensor.lt` """) add_docstr(torch._C.FloatTensorBase.map_, -""" + """ map_(tensor, callable) Applies :attr:`callable` for each element in this tensor and the given tensor @@ -799,7 +799,7 @@ signature:: """) add_docstr(torch._C.FloatTensorBase.masked_copy_, -""" + """ masked_copy_(mask, source) Copies elements from :attr:`source` into this tensor at positions where the @@ -818,7 +818,7 @@ Args: """) add_docstr(torch._C.FloatTensorBase.masked_fill_, -""" + """ masked_fill_(mask, value) Fills elements of this tensor with :attr:`value` where :attr:`mask` is one. @@ -831,84 +831,84 @@ Args: """) add_docstr(torch._C.FloatTensorBase.masked_select, -""" + """ masked_select(mask) -> Tensor See :func:`torch.masked_select` """) add_docstr(torch._C.FloatTensorBase.max, -""" + """ max(dim=None) -> float or (Tensor, Tensor) See :func:`torch.max` """) add_docstr(torch._C.FloatTensorBase.mean, -""" + """ mean(dim=None) -> float or (Tensor, Tensor) See :func:`torch.mean` """) add_docstr(torch._C.FloatTensorBase.median, -""" + """ median(dim=-1, values=None, indices=None) -> (Tensor, LongTensor) See :func:`torch.median` """) add_docstr(torch._C.FloatTensorBase.min, -""" + """ min(dim=None) -> float or (Tensor, Tensor) See :func:`torch.min` """) add_docstr(torch._C.FloatTensorBase.mm, -""" + """ mm(mat2) -> Tensor See :func:`torch.mm` """) add_docstr(torch._C.FloatTensorBase.mode, -""" + """ mode(dim=-1, values=None, indices=None) -> (Tensor, LongTensor) See :func:`torch.mode` """) add_docstr(torch._C.FloatTensorBase.mul, -""" + """ mul(value) -> Tensor See :func:`torch.mul` """) add_docstr(torch._C.FloatTensorBase.mul_, -""" + """ mul_(value) In-place version of :meth:`~Tensor.mul` """) add_docstr(torch._C.FloatTensorBase.multinomial, -""" + """ multinomial(generator=None, num_samples, replacement=False) See :func:`torch.multinomial` """) add_docstr(torch._C.FloatTensorBase.mv, -""" + """ mv(vec) -> Tensor See :func:`torch.mv` """) add_docstr(torch._C.FloatTensorBase.narrow, -""" + """ narrow(dimension, start, length) -> Tensor Returns a new tensor that is a narrowed version of this tensor. 
The dimension @@ -934,63 +934,63 @@ Example: """) add_docstr(torch._C.FloatTensorBase.ndimension, -""" + """ ndimension() -> int Alias for :meth:`~Tensor.dim()` """) add_docstr(torch._C.FloatTensorBase.ne, -""" + """ ne(other) -> Tensor See :func:`torch.ne` """) add_docstr(torch._C.FloatTensorBase.ne_, -""" + """ ne_(other) -> Tensor In-place version of :meth:`~Tensor.ne` """) add_docstr(torch._C.FloatTensorBase.neg, -""" + """ neg() -> Tensor See :func:`torch.neg` """) add_docstr(torch._C.FloatTensorBase.neg_, -""" + """ neg_() -> Tensor In-place version of :meth:`~Tensor.neg` """) add_docstr(torch._C.FloatTensorBase.nelement, -""" + """ nelement() -> int Alias for :meth:`~Tensor.numel` """) add_docstr(torch._C.FloatTensorBase.nonzero, -""" + """ nonzero() -> LongTensor See :func:`torch.nonzero` """) add_docstr(torch._C.FloatTensorBase.norm, -""" + """ norm(p=2) -> float See :func:`torch.norm` """) add_docstr(torch._C.FloatTensorBase.normal_, -""" + """ normal_(generator=None, mean=0, var=1) Fills this tensor with elements samples from the normal distribution @@ -998,14 +998,14 @@ parameterized by :attr:`mean` and :attr:`var`. """) add_docstr(torch._C.FloatTensorBase.numel, -""" + """ numel() -> int See :func:`torch.numel` """) add_docstr(torch._C.FloatTensorBase.numpy, -""" + """ numpy() -> ndarray Returns this tensor as a NumPy :class:`ndarray`. This tensor and the returned @@ -1014,77 +1014,77 @@ be reflected in the :class:`ndarray` and vice versa. """) add_docstr(torch._C.FloatTensorBase.orgqr, -""" + """ orgqr(input2) -> Tensor See :func:`torch.orgqr` """) add_docstr(torch._C.FloatTensorBase.ormqr, -""" + """ ormqr(input2, input3, left=True, transpose=False) -> Tensor See :func:`torch.ormqr` """) add_docstr(torch._C.FloatTensorBase.potrf, -""" + """ potrf(upper=True) -> Tensor See :func:`torch.potrf` """) add_docstr(torch._C.FloatTensorBase.potri, -""" + """ potri(upper=True) -> Tensor See :func:`torch.potri` """) add_docstr(torch._C.FloatTensorBase.potrs, -""" + """ potrs(input2, upper=True) -> Tensor See :func:`torch.potrs` """) add_docstr(torch._C.FloatTensorBase.pow, -""" + """ pow(exponent) See :func:`torch.pow` """) add_docstr(torch._C.FloatTensorBase.pow_, -""" + """ pow_(exponent) In-place version of :meth:`~Tensor.pow` """) add_docstr(torch._C.FloatTensorBase.prod, -""" + """ prod() -> float See :func:`torch.prod` """) add_docstr(torch._C.FloatTensorBase.pstrf, -""" + """ pstrf(upper=True, tol=-1) -> (Tensor, IntTensor) See :func:`torch.pstrf` """) add_docstr(torch._C.FloatTensorBase.qr, -""" + """ qr() -> (Tensor, Tensor) See :func:`torch.qr` """) add_docstr(torch._C.FloatTensorBase.random_, -""" + """ random_(generator=None, from=0, to=None) Fills this tensor with numbers sampled from the uniform distribution or @@ -1093,49 +1093,49 @@ defaults to the largest value representable by this tensor's data type. 
""") add_docstr(torch._C.FloatTensorBase.reciprocal, -""" + """ reciprocal() -> Tensor See :func:`torch.reciprocal` """) add_docstr(torch._C.FloatTensorBase.reciprocal_, -""" + """ reciprocal_() -> Tensor In-place version of :meth:`~Tensor.reciprocal` """) add_docstr(torch._C.FloatTensorBase.remainder, -""" + """ remainder(divisor) -> Tensor See :func:`torch.remainder` """) add_docstr(torch._C.FloatTensorBase.remainder_, -""" + """ remainder_(divisor) -> Tensor In-place version of :meth:`~Tensor.remainder` """) add_docstr(torch._C.FloatTensorBase.renorm, -""" + """ renorm(p, dim, maxnorm) -> Tensor See :func:`torch.renorm` """) add_docstr(torch._C.FloatTensorBase.renorm_, -""" + """ renorm_(p, dim, maxnorm) -> Tensor In-place version of :meth:`~Tensor.renorm` """) add_docstr(torch._C.FloatTensorBase.resize_, -""" + """ resize_(*sizes) Resizes this tensor to the specified size. If the number of elements is @@ -1157,7 +1157,7 @@ Example: """) add_docstr(torch._C.FloatTensorBase.resize_as_, -""" + """ resize_as_(tensor) Resizes the current tensor to be the same size as the specified tensor. This is @@ -1167,35 +1167,35 @@ equivalent to:: """) add_docstr(torch._C.FloatTensorBase.round, -""" + """ round() -> Tensor See :func:`torch.round` """) add_docstr(torch._C.FloatTensorBase.round_, -""" + """ round_() -> Tensor In-place version of :meth:`~Tensor.round` """) add_docstr(torch._C.FloatTensorBase.rsqrt, -""" + """ rsqrt() -> Tensor See :func:`torch.rsqrt` """) add_docstr(torch._C.FloatTensorBase.rsqrt_, -""" + """ rsqrt_() -> Tensor In-place version of :meth:`~Tensor.rsqrt` """) add_docstr(torch._C.FloatTensorBase.scatter_, -""" + """ scatter_(input, dim, index, src) -> Tensor Writes all values from the Tensor :attr:`src` into self at the indices specified @@ -1237,7 +1237,7 @@ Example:: """) add_docstr(torch._C.FloatTensorBase.select, -""" + """ select(dim, index) -> Tensor or number Slices the tensor along the selected dimension at the given index. If this @@ -1256,7 +1256,7 @@ Args: """) add_docstr(torch._C.FloatTensorBase.set_, -""" + """ set_(source=None, storage_offset=0, size=None, stride=None) Sets the underlying storage, size, and strides. If :attr:`source` is a tensor, @@ -1275,70 +1275,70 @@ Args: """) add_docstr(torch._C.FloatTensorBase.set_index, -""" + """ set_index(index, value) Alias for ``self[index] = value`` """) add_docstr(torch._C.FloatTensorBase.sigmoid, -""" + """ sigmoid() -> Tensor See :func:`torch.sigmoid` """) add_docstr(torch._C.FloatTensorBase.sigmoid_, -""" + """ sigmoid_() -> Tensor In-place version of :meth:`~Tensor.sigmoid` """) add_docstr(torch._C.FloatTensorBase.sign, -""" + """ sign() -> Tensor See :func:`torch.sign` """) add_docstr(torch._C.FloatTensorBase.sign_, -""" + """ sign_() -> Tensor In-place version of :meth:`~Tensor.sign` """) add_docstr(torch._C.FloatTensorBase.sin, -""" + """ sin() -> Tensor See :func:`torch.sin` """) add_docstr(torch._C.FloatTensorBase.sin_, -""" + """ sin_() -> Tensor In-place version of :meth:`~Tensor.sin` """) add_docstr(torch._C.FloatTensorBase.sinh, -""" + """ sinh() -> Tensor See :func:`torch.sinh` """) add_docstr(torch._C.FloatTensorBase.sinh_, -""" + """ sinh_() -> Tensor In-place version of :meth:`~Tensor.sinh` """) add_docstr(torch._C.FloatTensorBase.size, -""" + """ size() -> torch.Size Returns the size of the tensor. 
The returned value is a subclass of @@ -1350,56 +1350,56 @@ Example: """) add_docstr(torch._C.FloatTensorBase.sort, -""" + """ sort(dim=None, descending=False) -> (Tensor, LongTensor) See :func:`torch.sort` """) add_docstr(torch._C.FloatTensorBase.sqrt, -""" + """ sqrt() -> Tensor See :func:`torch.sqrt` """) add_docstr(torch._C.FloatTensorBase.sqrt_, -""" + """ sqrt_() -> Tensor In-place version of :meth:`~Tensor.sqrt` """) add_docstr(torch._C.FloatTensorBase.squeeze, -""" + """ squeeze(dim=None) See :func:`torch.squeeze` """) add_docstr(torch._C.FloatTensorBase.squeeze_, -""" + """ squeeze_(dim=None) In-place version of :meth:`~Tensor.squeeze` """) add_docstr(torch._C.FloatTensorBase.std, -""" + """ std() -> float See :func:`torch.std` """) add_docstr(torch._C.FloatTensorBase.storage, -""" + """ storage() -> torch.Storage Returns the underlying storage """) add_docstr(torch._C.FloatTensorBase.storage_offset, -""" + """ storage_offset() -> int Returns this tensor's offset in the underlying storage in terms of number of @@ -1414,14 +1414,14 @@ Example: """) add_docstr(torch._C.FloatTensorBase.stride, -""" + """ stride() -> tuple Returns the stride of the tensor. """) add_docstr(torch._C.FloatTensorBase.sub, -""" + """ sub(value, other) -> Tensor Subtracts a scalar or tensor from this tensor. If both :attr:`value` and @@ -1430,154 +1430,154 @@ Subtracts a scalar or tensor from this tensor. If both :attr:`value` and """) add_docstr(torch._C.FloatTensorBase.sub_, -""" + """ sub_(x) -> Tensor In-place version of :meth:`~Tensor.sub` """) add_docstr(torch._C.FloatTensorBase.sum, -""" + """ sum(dim=None) -> float See :func:`torch.sum` """) add_docstr(torch._C.FloatTensorBase.svd, -""" + """ svd(some=True) -> (Tensor, Tensor, Tensor) See :func:`torch.svd` """) add_docstr(torch._C.FloatTensorBase.symeig, -""" + """ symeig(eigenvectors=False, upper=True) -> (Tensor, Tensor) See :func:`torch.symeig` """) add_docstr(torch._C.FloatTensorBase.t, -""" + """ t() -> Tensor See :func:`torch.t` """) add_docstr(torch._C.FloatTensorBase.t_, -""" + """ t_() -> Tensor In-place version of :meth:`~Tensor.t` """) add_docstr(torch._C.FloatTensorBase.tan, -""" + """ tan() -> Tensor See :func:`torch.tan` """) add_docstr(torch._C.FloatTensorBase.tan_, -""" + """ tan_() -> Tensor In-place version of :meth:`~Tensor.tan` """) add_docstr(torch._C.FloatTensorBase.tanh, -""" + """ tanh() -> Tensor See :func:`torch.tanh` """) add_docstr(torch._C.FloatTensorBase.tanh_, -""" + """ tanh_() -> Tensor In-place version of :meth:`~Tensor.tanh` """) add_docstr(torch._C.FloatTensorBase.topk, -""" + """ topk(k, dim=None, largest=True, sorted=True) -> (Tensor, LongTensor) See :func:`torch.topk` """) add_docstr(torch._C.FloatTensorBase.trace, -""" + """ trace() -> float See :func:`torch.trace` """) add_docstr(torch._C.FloatTensorBase.transpose, -""" + """ transpose(dim0, dim1) -> Tensor See :func:`torch.transpose` """) add_docstr(torch._C.FloatTensorBase.transpose_, -""" + """ transpose_(dim0, dim1) -> Tensor In-place version of :meth:`~Tensor.transpose` """) add_docstr(torch._C.FloatTensorBase.tril, -""" + """ tril(k=0) -> Tensor See :func:`torch.tril` """) add_docstr(torch._C.FloatTensorBase.tril_, -""" + """ tril_(k=0) -> Tensor In-place version of :meth:`~Tensor.tril` """) add_docstr(torch._C.FloatTensorBase.triu, -""" + """ triu(k=0) -> Tensor See :func:`torch.triu` """) add_docstr(torch._C.FloatTensorBase.triu_, -""" + """ triu_(k=0) -> Tensor In-place version of :meth:`~Tensor.triu` """) add_docstr(torch._C.FloatTensorBase.trtrs, 
-""" + """ trtrs(A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) See :func:`torch.trtrs` """) add_docstr(torch._C.FloatTensorBase.trunc, -""" + """ trunc() -> Tensor See :func:`torch.trunc` """) add_docstr(torch._C.FloatTensorBase.trunc_, -""" + """ trunc_() -> Tensor In-place version of :meth:`~Tensor.trunc` """) add_docstr(torch._C.FloatTensorBase.unfold, -""" + """ unfold(dim, size, step) -> Tensor Returns a tensor which contains all slices of size :attr:`size` in @@ -1629,7 +1629,7 @@ Example:: """) add_docstr(torch._C.FloatTensorBase.uniform_, -""" + """ uniform_(from=0, to=1) -> Tensor Fills this tensor with numbers sampled from the uniform distribution: @@ -1640,14 +1640,14 @@ Fills this tensor with numbers sampled from the uniform distribution: """) add_docstr(torch._C.FloatTensorBase.var, -""" + """ var() -> float See :func:`torch.var` """) add_docstr(torch._C.FloatTensorBase.zero_, -""" + """ zero_() Fills this tensor with zeros. diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index 3909f0989c..1ccf46e5b3 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -22,7 +22,7 @@ def set_printoptions( edgeitems=None, linewidth=None, profile=None, - ): +): """Set options for printing. Items shamelessly taken from Numpy Args: @@ -119,7 +119,7 @@ def _number_format(tensor, min_sz=-1): else: if exp_max > prec + 1 or exp_max < 0: sz = max(min_sz, 7) - scale = math.pow(10, exp_max-1) + scale = math.pow(10, exp_max - 1) else: if exp_max == 0: sz = 7 @@ -132,19 +132,19 @@ def _number_format(tensor, min_sz=-1): def _tensor_str(self): n = PRINT_OPTS.edgeitems - has_hdots = self.size()[-1] > 2*n - has_vdots = self.size()[-2] > 2*n + has_hdots = self.size()[-1] > 2 * n + has_vdots = self.size()[-2] > 2 * n print_full_mat = not has_hdots and not has_vdots formatter = _number_format(self, min_sz=3 if not print_full_mat else 0) print_dots = self.numel() >= PRINT_OPTS.threshold dim_sz = max(2, max(len(str(x)) for x in self.size())) dim_fmt = "{:^" + str(dim_sz) + "}" - dot_fmt = u"{:^" + str(dim_sz+1) + "}" + dot_fmt = u"{:^" + str(dim_sz + 1) + "}" counter_dim = self.ndimension() - 2 counter = torch.LongStorage(counter_dim).fill_(0) - counter[counter.size()-1] = -1 + counter[counter.size() - 1] = -1 finished = False strt = '' while True: @@ -152,7 +152,7 @@ def _tensor_str(self): nskipped = [False for i in counter] for i in _range(counter_dim - 1, -1, -1): counter[i] += 1 - if print_dots and counter[i] == n and self.size(i) > 2*n: + if print_dots and counter[i] == n and self.size(i) > 2 * n: counter[i] = self.size(i) - n nskipped[i] = True if counter[i] == self.size(i): @@ -188,18 +188,18 @@ def __repr_row(row, indent, fmt, scale, sz, truncate=None): if truncate is not None: dotfmt = " {:^5} " return (indent + - ' '.join(fmt.format(val/scale) for val in row[:truncate]) + + ' '.join(fmt.format(val / scale) for val in row[:truncate]) + dotfmt.format('...') + - ' '.join(fmt.format(val/scale) for val in row[-truncate:]) + + ' '.join(fmt.format(val / scale) for val in row[-truncate:]) + '\n') else: - return indent + ' '.join(fmt.format(val/scale) for val in row) + '\n' + return indent + ' '.join(fmt.format(val / scale) for val in row) + '\n' def _matrix_str(self, indent='', formatter=None, force_truncate=False): n = PRINT_OPTS.edgeitems - has_hdots = self.size(1) > 2*n - has_vdots = self.size(0) > 2*n + has_hdots = self.size(1) > 2 * n + has_vdots = self.size(0) > 2 * n print_full_mat = not has_hdots and not has_vdots if formatter is None: @@ -207,14 +207,14 @@ 
def _matrix_str(self, indent='', formatter=None, force_truncate=False): min_sz=5 if not print_full_mat else 0) else: fmt, scale, sz = formatter - nColumnPerLine = int(math.floor((PRINT_OPTS.linewidth-len(indent))/(sz+1))) + nColumnPerLine = int(math.floor((PRINT_OPTS.linewidth - len(indent)) / (sz + 1))) strt = '' firstColumn = 0 if not force_truncate and \ (self.numel() < PRINT_OPTS.threshold or print_full_mat): while firstColumn < self.size(1): - lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1)-1) + lastColumn = min(firstColumn + nColumnPerLine - 1, self.size(1) - 1) if nColumnPerLine < self.size(1): strt += '\n' if firstColumn != 1 else '' strt += 'Columns {} to {} \n{}'.format( @@ -223,15 +223,15 @@ def _matrix_str(self, indent='', formatter=None, force_truncate=False): strt += SCALE_FORMAT.format(scale) for l in _range(self.size(0)): strt += indent + (' ' if scale != 1 else '') - row_slice = self[l, firstColumn:lastColumn+1] - strt += ' '.join(fmt.format(val/scale) for val in row_slice) + row_slice = self[l, firstColumn:lastColumn + 1] + strt += ' '.join(fmt.format(val / scale) for val in row_slice) strt += '\n' firstColumn = lastColumn + 1 else: if scale != 1: strt += SCALE_FORMAT.format(scale) if has_vdots and has_hdots: - vdotfmt = "{:^" + str((sz+1)*n-1) + "}" + vdotfmt = "{:^" + str((sz + 1) * n - 1) + "}" ddotfmt = u"{:^5}" for row in self[:n]: strt += __repr_row(row, indent, fmt, scale, sz, n) @@ -245,8 +245,8 @@ def _matrix_str(self, indent='', formatter=None, force_truncate=False): strt += __repr_row(row, indent, fmt, scale, sz, n) elif has_vdots and not has_hdots: vdotfmt = u"{:^" + \ - str(len(__repr_row(self[0], '', fmt, scale, sz))) + \ - "}\n" + str(len(__repr_row(self[0], '', fmt, scale, sz))) + \ + "}\n" for row in self[:n]: strt += __repr_row(row, indent, fmt, scale, sz) strt += vdotfmt.format(u'\u22EE') @@ -269,13 +269,13 @@ def _vector_str(self): ident = ' ' if self.numel() < PRINT_OPTS.threshold: return (strt + - '\n'.join(ident + fmt.format(val/scale) for val in self) + + '\n'.join(ident + fmt.format(val / scale) for val in self) + '\n') else: return (strt + - '\n'.join(ident + fmt.format(val/scale) for val in self[:n]) + + '\n'.join(ident + fmt.format(val / scale) for val in self[:n]) + '\n' + (ident + dotfmt.format(u"\u22EE")) + - '\n'.join(ident + fmt.format(val/scale) for val in self[-n:]) + + '\n'.join(ident + fmt.format(val / scale) for val in self[-n:]) + '\n') @@ -295,4 +295,3 @@ def _str(self): strt += '[{} of size {}{}]\n'.format(torch.typename(self), size_str, device_str) return '\n' + strt - diff --git a/torch/_thnn/__init__.py b/torch/_thnn/__init__.py index 97474692eb..dd41e47a27 100644 --- a/torch/_thnn/__init__.py +++ b/torch/_thnn/__init__.py @@ -2,7 +2,9 @@ import threading import torch.cuda from .utils import THNN_H_PATH, THCUNN_H_PATH, parse_header, load_backend + class Backends(object): + def __init__(self): self.backends = {} @@ -14,6 +16,7 @@ class Backends(object): class Backend(object): + def __init__(self, lib_prefix, lib_name, functions, mixins=tuple()): self.lib_prefix = lib_prefix self.lib_name = lib_name @@ -32,11 +35,12 @@ class Backend(object): with self.loading_lock: if self.backend is None: self.backend = load_backend(self.lib_prefix, self.lib_name, - self.functions, self.mixins) + self.functions, self.mixins) return self.backend class THNNCudaBackendStateMixin(object): + @property def library_state(self): return torch.cuda._state_cdata diff --git a/torch/_thnn/utils.py b/torch/_thnn/utils.py index 
c62fc2a29e..66d527a704 100644 --- a/torch/_thnn/utils.py +++ b/torch/_thnn/utils.py @@ -12,6 +12,7 @@ def _unpickle_backend(backend_name): class THNNBackendBase(object): + def __init__(self): self.methods = {} @@ -33,6 +34,7 @@ class THNNBackendBase(object): class Function(object): + def __init__(self, name): self.name = name self.arguments = [] @@ -46,6 +48,7 @@ class Function(object): class Argument(object): + def __init__(self, _type, name, is_optional): self.type = _type self.name = name diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ed46cc8ab8..8641c188a9 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4,7 +4,7 @@ import torch._C from torch._C import _add_docstr as add_docstr add_docstr(torch._C.abs, -"""abs(input, out=None) -> Tensor + """abs(input, out=None) -> Tensor Computes the element-wise absolute value of the given :attr:`input` a tensor. @@ -15,7 +15,7 @@ Example:: """) add_docstr(torch._C.acos, -""" + """ acos(input, out=None) -> Tensor Returns a new `Tensor` with the arccosine of the elements of :attr:`input`. @@ -44,7 +44,7 @@ Example:: """) add_docstr(torch._C.add, -""" + """ .. function:: add(input, value, out=None) Adds the scalar :attr:`value` to each element of the input :attr:`input` @@ -127,7 +127,7 @@ Example:: """) add_docstr(torch._C.addbmm, -""" + """ addbmm(beta=1, mat, alpha=1, batch1, batch2, out=None) -> Tensor Performs a batch matrix-matrix product of matrices stored @@ -167,7 +167,7 @@ Example:: """) add_docstr(torch._C.addcdiv, -""" + """ addcdiv(tensor, value=1, tensor1, tensor2, out=None) -> Tensor Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, @@ -195,7 +195,7 @@ Example:: """) add_docstr(torch._C.addcmul, -""" + """ addcmul(tensor, value=1, tensor1, tensor2, out=None) -> Tensor Performs the element-wise multiplication of :attr:`tensor1` @@ -224,7 +224,7 @@ Example:: """) add_docstr(torch._C.addmm, -""" + """ addmm(beta=1, mat, alpha=1, mat1, mat2, out=None) -> Tensor Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. @@ -259,7 +259,7 @@ Example:: """) add_docstr(torch._C.addmv, -""" + """ addmv(beta=1, tensor, alpha=1, mat, vec, out=None) -> Tensor Performs a matrix-vector product of the matrix :attr:`mat` and @@ -296,7 +296,7 @@ Example:: """) add_docstr(torch._C.addr, -r""" + r""" addr(beta=1, mat, alpha=1, vec1, vec2, out=None) -> Tensor Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` @@ -332,7 +332,7 @@ Example:: """) add_docstr(torch._C.asin, -""" + """ asin(input, out=None) -> Tensor Returns a new `Tensor` with the arcsine of the elements of :attr:`input`. @@ -360,7 +360,7 @@ Example:: """) add_docstr(torch._C.atan, -""" + """ atan(input, out=None) -> Tensor Returns a new `Tensor` with the arctangent of the elements of :attr:`input`. @@ -388,7 +388,7 @@ Example:: """) add_docstr(torch._C.atan2, -""" + """ atan2(input1, input2, out=None) -> Tensor Returns a new `Tensor` with the arctangent of the elements of :attr:`input1` @@ -418,7 +418,7 @@ Example:: """) add_docstr(torch._C.baddbmm, -r""" + r""" baddbmm(beta=1, mat, alpha=1, batch1, batch2, out=None) -> Tensor Performs a batch matrix-matrix product of matrices in :attr:`batch1` @@ -452,7 +452,7 @@ Example:: """) add_docstr(torch._C.bernoulli, -""" + """ bernoulli(input, out=None) -> Tensor Draws binary random numbers (0 or 1) from a bernoulli distribution. 
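A short usage sketch for `torch.bernoulli` as documented here: each entry of the input tensor is treated as the probability of drawing a 1 at that position.

    import torch

    p = torch.rand(3, 3)        # probabilities, uniform in [0, 1)
    draws = torch.bernoulli(p)  # same shape as p; every entry is 0 or 1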
@@ -508,7 +508,7 @@ Example:: """) add_docstr(torch._C.bmm, -""" + """ bmm(batch1, batch2, out=None) -> Tensor Performs a batch matrix-matrix product of matrices stored in :attr:`batch1` and :attr:`batch2`. @@ -533,7 +533,7 @@ Example:: """) add_docstr(torch._C.cat, -""" + """ cat(inputs, dimension=0) -> Tensor Concatenates the given sequence of :attr:`inputs` Tensors in the given dimension. @@ -574,7 +574,7 @@ Example:: """) add_docstr(torch._C.ceil, -""" + """ ceil(input, out=None) -> Tensor Returns a new `Tensor` with the ceil of the elements of :attr:`input`, the smallest integer greater than or equal to each element. @@ -605,7 +605,7 @@ Example:: """) add_docstr(torch._C.reciprocal, -""" + """ reciprocal(input, out=None) -> Tensor Returns a new `Tensor` with the reciprocal of the elements of :attr:`input`, i.e. :math:`1.0 / x` @@ -636,7 +636,7 @@ Example:: """) add_docstr(torch._C.clamp, -""" + """ clamp(input, min, max, out=None) -> Tensor Clamp all elements in :attr:`input` into the range `[min, max]` and return a resulting Tensor. @@ -731,7 +731,7 @@ Example:: """) add_docstr(torch._C.cos, -""" + """ cos(input, out=None) -> Tensor Returns a new `Tensor` with the cosine of the elements of :attr:`input`. @@ -759,7 +759,7 @@ Example:: """) add_docstr(torch._C.cosh, -""" + """ cosh(input, out=None) -> Tensor Returns a new `Tensor` with the hyperbolic cosine of the elements of :attr:`input`. @@ -787,7 +787,7 @@ Example:: """) add_docstr(torch._C.cross, -""" + """ cross(input, other, dim=-1, out=None) -> Tensor @@ -841,7 +841,7 @@ Example:: """) add_docstr(torch._C.cumprod, -""" + """ cumprod(input, dim, out=None) -> Tensor Returns the cumulative product of elements of :attr:`input` in the dimension :attr:`dim`. @@ -903,7 +903,7 @@ Example:: """) add_docstr(torch._C.cumsum, -""" + """ cumsum(input, dim, out=None) -> Tensor Returns the cumulative sum of elements of :attr:`input` in the dimension :attr:`dim`. @@ -951,7 +951,7 @@ Example:: """) add_docstr(torch._C.diag, -""" + """ diag(input, diagonal=0, out=None) -> Tensor - If :attr:`input` is a vector (1D Tensor), then returns a 2D square Tensor with the elements of :attr:`input` as the diagonal. @@ -1022,7 +1022,7 @@ Get the k-th diagonal of a given matrix:: """) add_docstr(torch._C.dist, -""" + """ dist(input, other, p=2, out=None) -> Tensor Returns the p-norm of (:attr:`input` - :attr:`other`) @@ -1066,7 +1066,7 @@ Example:: """) add_docstr(torch._C.div, -""" + """ .. function:: div(input, value, out=None) Divides each element of the input :attr:`input` with the scalar :attr:`value` and returns a new resulting tensor. @@ -1150,7 +1150,7 @@ Example:: """) add_docstr(torch._C.dot, -""" + """ dot(tensor1, tensor2) -> float Computes the dot product (inner product) of two tensors. Both tensors are @@ -1163,7 +1163,7 @@ Example:: """) add_docstr(torch._C.eig, -""" + """ eig(a, eigenvectors=False, out=None) -> (Tensor, Tensor) Computes the eigenvalues and eigenvectors of a real square matrix. @@ -1183,7 +1183,7 @@ Returns: """) add_docstr(torch._C.eq, -""" + """ eq(input, other, out=None) -> Tensor Computes element-wise equality @@ -1208,7 +1208,7 @@ Example:: """) add_docstr(torch._C.equal, -""" + """ equal(tensor1, tensor2) -> bool True if two tensors have the same size and elements, False otherwise. @@ -1220,7 +1220,7 @@ Example:: """) add_docstr(torch._C.exp, -""" + """ exp(tensor, out=None) -> Tensor Computes the exponential of each element. 
@@ -1232,7 +1232,7 @@ Example:: """) add_docstr(torch._C.eye, -""" + """ eye(n, m=None, out=None) Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. @@ -1255,7 +1255,7 @@ Example:: """) add_docstr(torch._C.floor, -""" + """ floor(input, out=None) -> Tensor Returns a new `Tensor` with the floor of the elements of :attr:`input`, the largest integer less than or equal to each element. @@ -1287,7 +1287,7 @@ Example:: """) add_docstr(torch._C.fmod, -""" + """ fmod(input, divisor, out=None) -> Tensor Computes the element-wise remainder of division. @@ -1315,7 +1315,7 @@ Example:: """) add_docstr(torch._C.frac, -""" + """ frac(tensor, out=None) -> Tensor Computes the fractional portion of each element in `tensor`. @@ -1327,7 +1327,7 @@ Example:: """) add_docstr(torch._C.from_numpy, -""" + """ from_numpy(ndarray) -> Tensor Creates a :class:`Tensor` from a :class:`numpy.ndarray`. @@ -1348,7 +1348,7 @@ Example:: """) add_docstr(torch._C.gather, -""" + """ gather(input, dim, index, out=None) -> Tensor Gathers values along an axis specified by `dim`. @@ -1375,7 +1375,7 @@ Example:: """) add_docstr(torch._C.ge, -""" + """ ge(input, other, out=None) -> Tensor Computes `tensor >= other` element-wise. @@ -1400,7 +1400,7 @@ Example:: """) add_docstr(torch._C.gels, -r""" + r""" gels(B, A, out=None) -> Tensor Computes the solution to the least squares and least norm problems for a full @@ -1466,7 +1466,7 @@ Example:: """) add_docstr(torch._C.geqrf, -r""" + r""" geqrf(input, out=None) -> (Tensor, Tensor) This is a low-level function for calling LAPACK directly. @@ -1489,7 +1489,7 @@ Args: """) add_docstr(torch._C.ger, -""" + """ ger(vec1, vec2, out=None) -> Tensor Outer product of :attr:`vec1` and :attr:`vec2`. If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector of size `m`, then :attr:`out` must be a matrix of size `n x m`. @@ -1513,7 +1513,7 @@ Example:: """) add_docstr(torch._C.gesv, -""" + """ gesv(B, A, out=None) -> (Tensor, Tensor) `X, LU = torch.gesv(B, A)` returns the solution to the system of linear @@ -1552,14 +1552,14 @@ Example:: """) add_docstr(torch._C.get_num_threads, -""" + """ get_num_threads() -> int Gets the number of OpenMP threads used for parallelizing CPU operations """) add_docstr(torch._C.gt, -""" + """ gt(input, other, out=None) -> Tensor Computes `tensor > other` element-wise. @@ -1584,7 +1584,7 @@ Example:: """) add_docstr(torch._C.histc, -""" + """ histc(input, bins=100, min=0, max=0, out=None) -> Tensor Computes the histogram of a tensor. @@ -1610,7 +1610,7 @@ Example:: """) add_docstr(torch._C.index_select, -""" + """ index_select(input, dim, index, out=None) -> Tensor Returns a new `Tensor` which indexes the :attr:`input` `Tensor` along dimension :attr:`dim` @@ -1653,7 +1653,7 @@ Example:: """) add_docstr(torch._C.inverse, -""" + """ inverse(input, out=None) -> Tensor Takes the inverse of the square matrix :attr:`input`. @@ -1704,7 +1704,7 @@ Example:: """) add_docstr(torch._C.kthvalue, -""" + """ kthvalue(input, k, dim=None, out=None) -> (Tensor, LongTensor) Returns the :attr:`k`th smallest element of the given :attr:`input` Tensor along a given dimension. @@ -1745,7 +1745,7 @@ Example:: """) add_docstr(torch._C.le, -""" + """ le(input, other, out=None) -> Tensor Computes `tensor <= other` element-wise. 
@@ -1770,7 +1770,7 @@ Example:: """) add_docstr(torch._C.lerp, -""" + """ lerp(start, end, weight, out=None) Does a linear interpolation of two tensors :attr:`start` and :attr:`end` based on a scalar :attr:`weight`: and returns the resulting :attr:`out` Tensor. @@ -1814,7 +1814,7 @@ Example:: """) add_docstr(torch._C.linspace, -""" + """ linspace(start, end, steps=100, out=None) -> Tensor Returns a one-dimensional Tensor of :attr:`steps` @@ -1860,7 +1860,7 @@ Example:: """) add_docstr(torch._C.log, -""" + """ log(input, out=None) -> Tensor Returns a new `Tensor` with the natural logarithm of the elements of :attr:`input`. @@ -1893,7 +1893,7 @@ Example:: """) add_docstr(torch._C.log1p, -""" + """ log1p(input, out=None) -> Tensor Returns a new `Tensor` with the natural logarithm of (1 + :attr:`input`). @@ -1930,7 +1930,7 @@ Example:: """) add_docstr(torch._C.logspace, -""" + """ logspace(start, end, steps=100, out=None) -> Tensor Returns a one-dimensional Tensor of :attr:`steps` points @@ -1967,7 +1967,7 @@ Example:: """) add_docstr(torch._C.lt, -""" + """ lt(input, other, out=None) -> Tensor Computes `tensor < other` element-wise. @@ -1992,7 +1992,7 @@ Example:: """) add_docstr(torch._C.masked_select, -""" + """ masked_select(input, mask, out=None) -> Tensor Returns a new 1D `Tensor` which indexes the :attr:`input` `Tensor` according to the binary mask :attr:`mask` which is a `ByteTensor`. @@ -2038,7 +2038,7 @@ Example:: """) add_docstr(torch._C.max, -""" + """ .. function:: max(input) -> float Returns the maximum value of all elements in the :attr:`input` Tensor. @@ -2144,7 +2144,7 @@ Example:: """) add_docstr(torch._C.mean, -""" + """ .. function:: mean(input) -> float Returns the mean value of all elements in the :attr:`input` Tensor. @@ -2197,7 +2197,7 @@ Example:: """) add_docstr(torch._C.median, -""" + """ median(input, dim=-1, values=None, indices=None) -> (Tensor, LongTensor) Returns the median value of each row of the :attr:`input` Tensor in the given dimension :attr:`dim`. @@ -2252,7 +2252,7 @@ Example:: """) add_docstr(torch._C.min, -""" + """ .. function:: min(input) -> float Returns the minimum value of all elements in the :attr:`input` Tensor. @@ -2357,7 +2357,7 @@ Example:: """) add_docstr(torch._C.mm, -""" + """ mm(mat1, mat2, out=None) -> Tensor Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. @@ -2380,7 +2380,7 @@ Example:: """) add_docstr(torch._C.mode, -""" + """ mode(input, dim=-1, values=None, indices=None) -> (Tensor, LongTensor) Returns the mode value of each row of the :attr:`input` Tensor in the given dimension :attr:`dim`. @@ -2435,7 +2435,7 @@ Example:: """) add_docstr(torch._C.mul, -""" + """ .. function:: mul(input, value, out=None) Multiplies each element of the input :attr:`input` with the scalar :attr:`value` and returns a new resulting tensor. @@ -2508,7 +2508,7 @@ Example:: """) add_docstr(torch._C.multinomial, -u""" + u""" multinomial(input, num_samples, replacement=False, out=None) -> LongTensor Returns a Tensor where each row @@ -2562,7 +2562,7 @@ Example:: """) add_docstr(torch._C.mv, -""" + """ mv(mat, vec, out=None) -> Tensor Performs a matrix-vector product of the matrix :attr:`mat` and the vector :attr:`vec`. @@ -2585,7 +2585,7 @@ Example:: """) add_docstr(torch._C.ne, -""" + """ ne(input, other, out=None) -> Tensor Computes `tensor != other` element-wise. 
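The elementwise comparison functions in this block (`eq`, `ge`, `gt`, `le`, `lt`, `ne`) all return a 0/1 mask of the same shape as their inputs (a `ByteTensor` here), which pairs naturally with `masked_select` documented above. A minimal sketch:

    import torch

    a = torch.Tensor([1, 2, 3])
    b = torch.Tensor([1, 0, 3])

    mask = torch.ne(a, b)                 # 1 where a != b, else 0
    diffs = torch.masked_select(a, mask)  # 1D tensor of the differing entries of a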
@@ -2610,7 +2610,7 @@ Example:: """) add_docstr(torch._C.neg, -""" + """ neg(input, out=None) -> Tensor Returns a new `Tensor` with the negative of the elements of :attr:`input`. @@ -2645,7 +2645,7 @@ Example:: """) add_docstr(torch._C.nonzero, -""" + """ nonzero(input, out=None) -> LongTensor Returns a tensor containing the indices of all non-zero elements of :attr:`input`. @@ -2681,7 +2681,7 @@ Example:: """) add_docstr(torch._C.norm, -""" + """ .. function:: norm(input, p=2) -> float Returns the p-norm of the :attr:`input` Tensor. @@ -2743,7 +2743,7 @@ Example:: """) add_docstr(torch._C.normal, -""" + """ .. function:: normal(means, stddevs, out=None) Returns a Tensor of random numbers drawn from separate normal distributions @@ -2825,7 +2825,7 @@ Example:: """) add_docstr(torch._C.numel, -""" + """ numel(input) -> int Returns the total number of elements in the :attr:`input` Tensor. @@ -2845,7 +2845,7 @@ Example:: """) add_docstr(torch._C.ones, -""" + """ ones(*sizes, out=None) -> Tensor Returns a Tensor filled with the scalar value `1`, with the shape defined @@ -2896,7 +2896,7 @@ Example:: # """) add_docstr(torch._C.pow, -""" + """ .. function:: pow(input, exponent, out=None) Takes the power of each element in :attr:`input` with :attr:`exponent` and returns a Tensor with the result. @@ -2991,7 +2991,7 @@ Example:: """) add_docstr(torch._C.prod, -""" + """ .. function:: prod(input) -> float Returns the product of all elements in the :attr:`input` Tensor. @@ -3049,7 +3049,7 @@ Example:: # """) add_docstr(torch._C.qr, -""" + """ qr(input, out=None) -> (Tensor, Tensor) Computes the QR decomposition of a matrix :attr:`input`: returns matrices @@ -3106,7 +3106,7 @@ Example:: """) add_docstr(torch._C.rand, -""" + """ rand(*sizes, out=None) -> Tensor Returns a Tensor filled with random numbers from a uniform distribution @@ -3137,7 +3137,7 @@ Example:: """) add_docstr(torch._C.randn, -""" + """ randn(*sizes, out=None) -> Tensor Returns a Tensor filled with random numbers from a normal distribution @@ -3168,7 +3168,7 @@ Example:: """) add_docstr(torch._C.randperm, -""" + """ randperm(n, out=None) -> LongTensor Returns a random permutation of integers from ``0`` to ``n - 1``. @@ -3188,7 +3188,7 @@ Example:: """) add_docstr(torch._C.range, -""" + """ range(start, end, step=1, out=None) -> Tensor returns a 1D Tensor of size :math:`floor((end - start) / step) + 1` with values @@ -3225,7 +3225,7 @@ Example:: """) add_docstr(torch._C.remainder, -""" + """ remainder(input, divisor, out=None) -> Tensor Computes the element-wise remainder of division. @@ -3253,7 +3253,7 @@ Example:: """) add_docstr(torch._C.renorm, -""" + """ renorm(input, p, dim, maxnorm, out=None) -> Tensor Returns a Tensor where each sub-tensor of :attr:`input` along dimension :attr:`dim` @@ -3290,7 +3290,7 @@ Example:: """) add_docstr(torch._C.round, -""" + """ round(input, out=None) -> Tensor Returns a new `Tensor` with each of the elements of :attr:`input` rounded to the closest integer. @@ -3321,7 +3321,7 @@ Example:: """) add_docstr(torch._C.rsqrt, -""" + """ rsqrt(input, out=None) -> Tensor Returns a new `Tensor` with the reciprocal of the square-root of each of the elements of :attr:`input`. @@ -3352,14 +3352,14 @@ Example:: """) add_docstr(torch._C.set_num_threads, -""" + """ set_num_threads(int) Sets the number of OpenMP threads used for parallelizing CPU operations """) add_docstr(torch._C.sigmoid, -""" + """ sigmoid(input, out=None) -> Tensor Returns a new `Tensor` with the sigmoid of the elements of :attr:`input`. 
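Both fmod (earlier in this file) and remainder above carry the same one-line summary, "Computes the element-wise remainder of division"; the practical difference is whose sign the result takes. A short comparison, assuming integer-valued float tensors for readability::

    import torch

    x = torch.Tensor([-3, -2, -1, 1, 2, 3])
    torch.fmod(x, 2)       # follows the dividend's sign (C fmod):   [-1, 0, -1, 1, 0, 1]
    torch.remainder(x, 2)  # follows the divisor's sign (Python %):  [ 1, 0,  1, 1, 0, 1]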
@@ -3390,7 +3390,7 @@ Example:: """) add_docstr(torch._C.sign, -""" + """ sign(input, out=None) -> Tensor Returns a new `Tensor` with the sign of the elements of :attr:`input`. @@ -3420,7 +3420,7 @@ Example:: """) add_docstr(torch._C.sin, -""" + """ sin(input, out=None) -> Tensor Returns a new `Tensor` with the sine of the elements of :attr:`input`. @@ -3448,7 +3448,7 @@ Example:: """) add_docstr(torch._C.sinh, -""" + """ sinh(input, out=None) -> Tensor Returns a new `Tensor` with the hyperbolic sine of the elements of :attr:`input`. @@ -3476,7 +3476,7 @@ Example:: """) add_docstr(torch._C.sort, -""" + """ sort(input, dim=None, descending=False, out=None) -> (Tensor, LongTensor) Sorts the elements of the :attr:`input` Tensor along a given dimension in ascending order by value. @@ -3530,7 +3530,7 @@ Example:: """) add_docstr(torch._C.sqrt, -""" + """ sqrt(input, out=None) -> Tensor Returns a new `Tensor` with the square-root of the elements of :attr:`input`. @@ -3561,7 +3561,7 @@ Example:: """) add_docstr(torch._C.squeeze, -""" + """ squeeze(input, dim=None, out=None) Returns a `Tensor` with all the dimensions of :attr:`input` of size `1` removed. @@ -3599,7 +3599,7 @@ Example:: """) add_docstr(torch._C.std, -""" + """ .. function:: std(input) -> float Returns the standard-deviation of all elements in the :attr:`input` Tensor. @@ -3652,7 +3652,7 @@ Example:: """) add_docstr(torch._C.sum, -""" + """ .. function:: sum(input) -> float Returns the sum of all elements in the :attr:`input` Tensor. @@ -3705,7 +3705,7 @@ Example:: """) add_docstr(torch._C.svd, -""" + """ svd(input, some=True, out=None) -> (Tensor, Tensor, Tensor) `U, S, V = torch.svd(A)` returns the singular value decomposition of a @@ -3780,7 +3780,7 @@ Example:: """) add_docstr(torch._C.symeig, -""" + """ symeig(input, eigenvectors=False, upper=True, out=None) -> (Tensor, Tensor) `e, V = torch.symeig(input)` returns eigenvalues and eigenvectors @@ -3842,7 +3842,7 @@ Examples:: """) add_docstr(torch._C.t, -""" + """ t(input, out=None) -> Tensor Expects :attr:`input` to be a matrix (2D Tensor) and transposes dimensions 0 and 1. @@ -3872,7 +3872,7 @@ Example:: """) add_docstr(torch._C.tan, -""" + """ tan(input, out=None) -> Tensor Returns a new `Tensor` with the tangent of the elements of :attr:`input`. @@ -3900,7 +3900,7 @@ Example:: """) add_docstr(torch._C.tanh, -""" + """ tanh(input, out=None) -> Tensor Returns a new `Tensor` with the hyperbolic tangent of the elements of :attr:`input`. @@ -3928,7 +3928,7 @@ Example:: """) add_docstr(torch._C.topk, -""" + """ topk(input, k, dim=None, largest=True, sorted=True, out=None) -> (Tensor, LongTensor) Returns the :attr:`k` largest elements of the given :attr:`input` Tensor along a given dimension. @@ -3992,7 +3992,7 @@ Example:: """) add_docstr(torch._C.trace, -""" + """ trace(input) -> float Returns the sum of the elements of the diagonal of the input 2D matrix. @@ -4013,7 +4013,7 @@ Example:: """) add_docstr(torch._C.transpose, -""" + """ transpose(input, dim0, dim1, out=None) -> Tensor Returns a `Tensor` that is a transposed version of :attr:`input`. The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. 
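Of the entries just above, squeeze is worth a quick shape check: with no dim argument every size-1 dimension is dropped, while with a dim argument only that dimension is considered. A minimal sketch::

    import torch

    x = torch.zeros(2, 1, 2, 1, 2)
    torch.squeeze(x).size()      # all size-1 dims removed      -> 2x2x2
    torch.squeeze(x, 0).size()   # dim 0 has size 2, unchanged  -> 2x1x2x1x2
    torch.squeeze(x, 1).size()   # only dim 1 removed           -> 2x2x1x2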
@@ -4044,7 +4044,7 @@ Example:: """) add_docstr(torch._C.tril, -""" + """ tril(input, k=0, out=None) -> Tensor Returns the lower triangular part of the matrix (2D Tensor) :attr:`input`, @@ -4097,7 +4097,7 @@ Example:: """) add_docstr(torch._C.triu, -""" + """ triu(input, k=0, out=None) -> Tensor Returns the upper triangular part of the matrix (2D Tensor) :attr:`input`, @@ -4155,7 +4155,7 @@ Example:: # """) add_docstr(torch._C.trunc, -""" + """ trunc(input, out=None) -> Tensor Returns a new `Tensor` with the truncated integer values of the elements of :attr:`input`. @@ -4186,7 +4186,7 @@ Example:: """) add_docstr(torch._C.var, -""" + """ .. function:: var(input) -> float Returns the variance of all elements in the :attr:`input` Tensor. @@ -4239,7 +4239,7 @@ Example:: """) add_docstr(torch._C.zeros, -""" + """ zeros(*sizes, out=None) -> Tensor Returns a Tensor filled with the scalar value `0`, with the shape defined diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 6a7614dab9..7786a4ee8f 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -12,6 +12,7 @@ from .stochastic_function import StochasticFunction __all__ = ['Variable', 'Function', 'StochasticFunction', 'backward'] + def backward(variables, grad_variables, retain_variables=False): """Computes the sum of gradients of given variables w.r.t. graph leaves. @@ -37,6 +38,6 @@ def backward(variables, grad_variables, retain_variables=False): times. """ Variable._execution_engine.run_backward( - tuple(variables), tuple(grad_variables), retain_variables) + tuple(variables), tuple(grad_variables), retain_variables) assert torch._C._autograd_init() diff --git a/torch/autograd/_functions/__init__.py b/torch/autograd/_functions/__init__.py index 9f82344c28..2c07aafabd 100644 --- a/torch/autograd/_functions/__init__.py +++ b/torch/autograd/_functions/__init__.py @@ -5,4 +5,3 @@ from .reduce import * from .linalg import * from .blas import * from .stochastic import * - diff --git a/torch/autograd/_functions/basic_ops.py b/torch/autograd/_functions/basic_ops.py index c8ff8bcfbd..1a405efbc5 100644 --- a/torch/autograd/_functions/basic_ops.py +++ b/torch/autograd/_functions/basic_ops.py @@ -59,7 +59,7 @@ class Pow(Function): def backward(self, grad_output): a, b = self.saved_tensors - return grad_output.mul(b).mul_(a.pow(b-1)), grad_output.mul(a.pow(b)).mul_(a.log()) + return grad_output.mul(b).mul_(a.pow(b - 1)), grad_output.mul(a.pow(b)).mul_(a.log()) class AddConstant(InplaceFunction): @@ -174,7 +174,7 @@ class PowConstant(Function): return grad_output.mul(self.fw_result).mul_(math.log(self.constant)) else: a = self.saved_tensors[0] - return grad_output.mul(self.constant).mul_(a.pow(self.constant-1)) + return grad_output.mul(self.constant).mul_(a.pow(self.constant - 1)) class Negate(InplaceFunction): diff --git a/torch/autograd/_functions/blas.py b/torch/autograd/_functions/blas.py index 0f584b6044..5738c92458 100644 --- a/torch/autograd/_functions/blas.py +++ b/torch/autograd/_functions/blas.py @@ -25,7 +25,7 @@ class Addmm(_BlasBase): self.save_for_backward(matrix1, matrix2) output = self._get_output(add_matrix) return torch.addmm(self.alpha, add_matrix, self.beta, - matrix1, matrix2, out=output) + matrix1, matrix2, out=output) def backward(self, grad_output): matrix1, matrix2 = self.saved_tensors @@ -55,7 +55,7 @@ class Addbmm(_BlasBase): self.save_for_backward(batch1, batch2) output = self._get_output(add_matrix) return torch.addbmm(self.alpha, add_matrix, self.beta, - batch1, batch2, out=output) 
+ batch1, batch2, out=output) def backward(self, grad_output): batch1, batch2 = self.saved_tensors @@ -68,8 +68,8 @@ class Addbmm(_BlasBase): if any(self.needs_input_grad[1:]): batch_grad_output = (grad_output - .unsqueeze(0) - .expand(batch1.size(0), batch1.size(1), batch2.size(2))) + .unsqueeze(0) + .expand(batch1.size(0), batch1.size(1), batch2.size(2))) if self.needs_input_grad[1]: grad_batch1 = torch.bmm(batch_grad_output, batch2.transpose(1, 2)) @@ -90,7 +90,7 @@ class Baddbmm(_BlasBase): self.save_for_backward(batch1, batch2) output = self._get_output(add_batch) return torch.baddbmm(self.alpha, add_batch, self.beta, - batch1, batch2, out=output) + batch1, batch2, out=output) def backward(self, grad_output): batch1, batch2 = self.saved_tensors @@ -120,7 +120,7 @@ class Addmv(_BlasBase): self.save_for_backward(matrix, vector) output = self._get_output(add_vector) return torch.addmv(self.alpha, add_vector, self.beta, - matrix, vector, out=output) + matrix, vector, out=output) def backward(self, grad_output): matrix, vector = self.saved_tensors @@ -150,7 +150,7 @@ class Addr(_BlasBase): self.save_for_backward(vector1, vector2) output = self._get_output(add_matrix) return torch.addr(self.alpha, add_matrix, self.beta, - vector1, vector2, out=output) + vector1, vector2, out=output) def backward(self, grad_output): vector1, vector2 = self.saved_tensors @@ -199,4 +199,3 @@ class Dot(Function): # TODO: trace # TODO: tril # TODO: triu - diff --git a/torch/autograd/_functions/linalg.py b/torch/autograd/_functions/linalg.py index 93c69062f8..7c610b56e7 100644 --- a/torch/autograd/_functions/linalg.py +++ b/torch/autograd/_functions/linalg.py @@ -42,4 +42,3 @@ class Triu(Function): return grad_output.triu(self.diagonal_idx) # TODO: trace - diff --git a/torch/autograd/_functions/pointwise.py b/torch/autograd/_functions/pointwise.py index c9b95628ae..cdc3db9e96 100644 --- a/torch/autograd/_functions/pointwise.py +++ b/torch/autograd/_functions/pointwise.py @@ -165,6 +165,7 @@ class Tan(Function): class Asin(Function): + def forward(self, i): self.save_for_backward(i) return i.asin() @@ -175,6 +176,7 @@ class Asin(Function): class Acos(Function): + def forward(self, i): self.save_for_backward(i) return i.acos() @@ -185,6 +187,7 @@ class Acos(Function): class Atan(Function): + def forward(self, i): self.save_for_backward(i) return i.atan() diff --git a/torch/autograd/_functions/reduce.py b/torch/autograd/_functions/reduce.py index 314ae3aef5..07253cc42f 100644 --- a/torch/autograd/_functions/reduce.py +++ b/torch/autograd/_functions/reduce.py @@ -4,6 +4,7 @@ from ..function import Function class _DimReduceFunction(Function): + def __init__(self, dim=None): super(_DimReduceFunction, self).__init__() self.dim = dim @@ -139,6 +140,7 @@ class Kthvalue(_SelectionFunction): class Norm(Function): + def __init__(self, norm_type=2, dim=None): super(Norm, self).__init__() self.norm_type = norm_type diff --git a/torch/autograd/_functions/stochastic.py b/torch/autograd/_functions/stochastic.py index 2290e35245..4a4eb64d2e 100644 --- a/torch/autograd/_functions/stochastic.py +++ b/torch/autograd/_functions/stochastic.py @@ -65,7 +65,7 @@ class Normal(StochasticFunction): output.mul_(stddevs) else: raise RuntimeError("Normal function requires specifying a common " - "stddev, or per-sample stddev") + "stddev, or per-sample stddev") output.add_(means) self.save_for_backward(output, means, stddevs) self.mark_non_differentiable(output) @@ -74,7 +74,7 @@ class Normal(StochasticFunction): def backward(self, reward): 
output, means, stddevs = self.saved_tensors grad_stddevs = None - grad_means = means - output # == -(output - means) + grad_means = means - output # == -(output - means) assert self.stddev is not None or stddevs is not None if self.stddev is not None: grad_means /= 1e-6 + self.stddev ** 2 @@ -88,4 +88,3 @@ class Normal(StochasticFunction): grad_means /= stddevs_sq grad_means *= reward return grad_means, grad_stddevs - diff --git a/torch/autograd/_functions/tensor.py b/torch/autograd/_functions/tensor.py index fa9d81167e..d52eac50a6 100644 --- a/torch/autograd/_functions/tensor.py +++ b/torch/autograd/_functions/tensor.py @@ -103,6 +103,7 @@ class View(Function): class Expand(Function): + def __init__(self, sizes): super(Expand, self).__init__() self.sizes = sizes @@ -110,8 +111,8 @@ class Expand(Function): def forward(self, i): self.expanded_dims = [dim for dim, (expanded, original) - in enumerate(zip(self.sizes, i.size())) - if expanded != original] + in enumerate(zip(self.sizes, i.size())) + if expanded != original] result = i.expand(*self.sizes) self.mark_shared_storage((i, result)) return result @@ -304,8 +305,8 @@ class Concat(Function): return torch.cat(inputs, self.dim) def backward(self, grad_output): - return tuple(grad_output.narrow(self.dim, end-size, size) for size, end - in zip(self.input_sizes, _accumulate(self.input_sizes))) + return tuple(grad_output.narrow(self.dim, end - size, size) for size, end + in zip(self.input_sizes, _accumulate(self.input_sizes))) class Resize(Function): @@ -318,11 +319,11 @@ class Resize(Function): def forward(self, tensor): if tensor.numel() != self.numel: raise RuntimeError(("requested resize to {} ({} elements in total), " - "but the given tensor has a size of {} ({} elements). " - "autograd's resize can only change the shape of a given " - "tensor, while preserving the number of elements. ").format( - 'x'.join(map(str, self.sizes)), self.numel, - 'x'.join(map(str, tensor.size())), tensor.numel())) + "but the given tensor has a size of {} ({} elements). " + "autograd's resize can only change the shape of a given " + "tensor, while preserving the number of elements. 
").format( + 'x'.join(map(str, self.sizes)), self.numel, + 'x'.join(map(str, tensor.size())), tensor.numel())) self.input_sizes = tensor.size() result = tensor.new(tensor).resize_(*self.sizes) self.mark_shared_storage((tensor, result)) @@ -493,7 +494,7 @@ class Topk(_MultiSelectionFunction): self.sort = sort def forward(self, input): - dim = self.dim if self.dim is not None else input.dim()-1 + dim = self.dim if self.dim is not None else input.dim() - 1 self.args = (self.k, dim, self.largest, self.sort) return super(Topk, self).forward(input) diff --git a/torch/autograd/engine.py b/torch/autograd/engine.py index fd264bd85c..0865aca6df 100644 --- a/torch/autograd/engine.py +++ b/torch/autograd/engine.py @@ -71,8 +71,8 @@ class BasicEngine(object): else: if prev_fn.num_outputs != 1: raise RuntimeError("one of the function outputs " - "wasn't used - this is an error not, but " - "it's going to be fixed soon") + "wasn't used - this is an error not, but " + "it's going to be fixed soon") prev_grad = (d_prev_fn,) ready.appendleft((prev_fn, prev_grad)) else: diff --git a/torch/autograd/function.py b/torch/autograd/function.py index 74bc5024ac..e31d2758f8 100644 --- a/torch/autograd/function.py +++ b/torch/autograd/function.py @@ -154,9 +154,10 @@ def _nested_map(condition, fn): return type(obj)(_map(x) for x in obj) else: raise ValueError("NestedIOFunction doesn't know how to process " - "an input object of type " + torch.typename(obj)) + "an input object of type " + torch.typename(obj)) return _map + def _iter_filter(condition): def _iter(obj): if condition(obj): @@ -169,7 +170,7 @@ def _iter_filter(condition): yield var else: raise ValueError("NestedIOFunction doesn't know how to process " - "an input object of type " + torch.typename(obj)) + "an input object of type " + torch.typename(obj)) return _iter @@ -178,8 +179,10 @@ _iter_tensors = _iter_filter(torch.is_tensor) _iter_None_tensors = _iter_filter(lambda o: o is None or torch.is_tensor(o)) _map_variable_tensor = _nested_map(lambda o: isinstance(o, torch.autograd.Variable), lambda o: o.data) + def _map_tensor_fromiter(itr): - return _nested_map(lambda o: torch.is_tensor(o), lambda o: next(itr)) + return _nested_map(lambda o: torch.is_tensor(o), lambda o: next(itr)) + class NestedIOFunction(Function): diff --git a/torch/autograd/stochastic_function.py b/torch/autograd/stochastic_function.py index 74d598263a..cc81248727 100644 --- a/torch/autograd/stochastic_function.py +++ b/torch/autograd/stochastic_function.py @@ -2,6 +2,7 @@ from .function import Function _NOT_PROVIDED = object() + class StochasticFunction(Function): def __init__(self): @@ -10,7 +11,7 @@ class StochasticFunction(Function): def _do_backward(self, grad_output, retain_variables): if self.reward is _NOT_PROVIDED: raise RuntimeError("differentiating stochastic functions requires " - "providing a reward") + "providing a reward") result = super(StochasticFunction, self)._do_backward((self.reward,), retain_variables) if not retain_variables: self.reward = None @@ -18,4 +19,3 @@ class StochasticFunction(Function): def _reinforce(self, reward): self.reward = reward - diff --git a/torch/autograd/variable.py b/torch/autograd/variable.py index c48957a5a3..e03d8c1eef 100644 --- a/torch/autograd/variable.py +++ b/torch/autograd/variable.py @@ -72,12 +72,12 @@ class Variable(_C._VariableBase): if self.creator is not None: if value is False: hint = (" If you want to use a computed variable in a subgraph " - "that doesn't require differentiation use " - "var_no_grad = var.detach().") + 
"that doesn't require differentiation use " + "var_no_grad = var.detach().") else: hint = '' raise RuntimeError("you can only change requires_grad flags of " - "leaf variables." + hint) + "leaf variables." + hint) self._requires_grad = value def __getattr__(self, name): @@ -87,13 +87,13 @@ class Variable(_C._VariableBase): def __getitem__(self, key): if (isinstance(key, Variable) and - type(key.data).__name__ == 'ByteTensor'): + type(key.data).__name__ == 'ByteTensor'): return MaskedSelect()(self, key) return Index(key)(self) def __setitem__(self, key, value): if (isinstance(key, Variable) and - type(key.data).__name__ == 'ByteTensor'): + type(key.data).__name__ == 'ByteTensor'): if isinstance(value, Variable): return MaskedCopy(inplace=True)(self, key, value) else: @@ -107,9 +107,9 @@ class Variable(_C._VariableBase): def __deepcopy__(self, memo): if self.creator is not None: raise RuntimeError("Only Variables created explicitly by the user " - "(graph leaves) support the deepcopy protocol at the moment") + "(graph leaves) support the deepcopy protocol at the moment") result = type(self)(self.data.clone(), requires_grad=self.requires_grad, - volatile=self.volatile) + volatile=self.volatile) memo[id(self)] = result return result @@ -151,7 +151,8 @@ class Variable(_C._VariableBase): raise RuntimeError('calling backward on a volatile variable') if gradient is None and self.requires_grad: if self.data.numel() != 1: - raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable') + raise RuntimeError( + 'backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable') gradient = self.data.new().resize_as_(self.data).fill_(1) self._execution_engine.run_backward((self,), (gradient,), retain_variables) @@ -219,7 +220,7 @@ class Variable(_C._VariableBase): """ if not isinstance(self.creator, StochasticFunction): raise RuntimeError("reinforce() can be only called on outputs " - "of stochastic functions") + "of stochastic functions") self.creator._reinforce(reward) def detach(self): @@ -392,7 +393,7 @@ class Variable(_C._VariableBase): def clamp(self, min=None, max=None): if min is None and max is None: raise ValueError("clamp requires specifying at least one of " - "min and max arguments") + "min and max arguments") elif min is None and max is not None: return CminConstant(max)(self) elif min is not None and max is None: @@ -503,7 +504,7 @@ class Variable(_C._VariableBase): def bmm(self, batch): output = Variable(self.data.new(self.data.size(0), self.data.size(1), - batch.data.size(2))) + batch.data.size(2))) return self._static_blas(Baddbmm, (output, 0, 1, self, batch), False) def mv(self, vector): @@ -622,7 +623,7 @@ class Variable(_C._VariableBase): if isinstance(sizes[0], torch.Size): if len(sizes) > 1: raise ValueError("expand expects a several ints or a single " - "torch.Size argument") + "torch.Size argument") sizes = sizes[0] return Expand(sizes)(self) @@ -641,7 +642,7 @@ class Variable(_C._VariableBase): def narrow(self, dim, start_index, length): index = tuple(slice(None, None) for _ in range(dim)) + \ - (slice(start_index, start_index+length),) + (slice(start_index, start_index + length),) return Index(index)(self) @@ -710,7 +711,7 @@ class Variable(_C._VariableBase): elif dim_self == 2 and dim_other == 2: return self.mm(other) raise ValueError("both arguments to __matmul__ need to be 1D or 2D, " - "but they are {}D and {}D".format(dim_self, dim_other)) + "but they are {}D and 
{}D".format(dim_self, dim_other)) def __div__(self, other): return self.div(other) diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 4fda6123c3..8c2344ff81 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -20,6 +20,7 @@ elif sys.platform == 'darwin': else: libnames = [] + def _loadlib(): global lib loaded = False @@ -39,6 +40,7 @@ def _loadlib(): lib = None raise OSError("Could not load cuDNN") + def is_acceptable(tensor): if not enabled: return False @@ -58,13 +60,15 @@ def is_acceptable(tensor): return False if not _C.has_cudnn: warnings.warn("cuDNN library has been detected, but your pytorch " - "installation was compiled without support for it. You " - "might want to rebuild pytorch, making sure the library " - "is visible to the build system.") + "installation was compiled without support for it. You " + "might want to rebuild pytorch, making sure the library " + "is visible to the build system.") return False return True __cudnn_version = [] + + def version(): if not lib: raise RuntimeError("cuDNN not initialized") @@ -108,7 +112,9 @@ CUDNN_GRU = 3 CUDNN_LINEAR_INPUT = 0 CUDNN_SKIP_INPUT = 1 + class CuDNNHandle: + def __init__(self): ptr = ctypes.c_void_p() check_error(lib.cudnnCreate(ctypes.byref(ptr))) @@ -117,7 +123,9 @@ class CuDNNHandle: def __del__(self): check_error(lib.cudnnDestroy(self)) + class CuDNNError(RuntimeError): + def __init__(self, status): self.status = status msg = '{}: {}'.format(status, get_error_string(status)) @@ -125,6 +133,7 @@ class CuDNNError(RuntimeError): class TensorDescriptor(object): + def __init__(self): ptr = ctypes.c_void_p() check_error(lib.cudnnCreateTensorDescriptor(ctypes.byref(ptr))) @@ -147,6 +156,7 @@ class TensorDescriptor(object): class TensorDescriptorArray(object): + def __init__(self, N): self.ptrs = (ctypes.c_void_p * N)() for i in range(N): @@ -175,6 +185,7 @@ class TensorDescriptorArray(object): class ConvolutionDescriptor(object): + def __init__(self): ptr = ctypes.c_void_p() check_error(lib.cudnnCreateConvolutionDescriptor(ctypes.byref(ptr))) @@ -195,7 +206,9 @@ class ConvolutionDescriptor(object): def as_tuple(self): return (self._pad, self._stride) + class FilterDescriptor(object): + def __init__(self): ptr = ctypes.c_void_p() check_error(lib.cudnnCreateFilterDescriptor(ctypes.byref(ptr))) @@ -216,6 +229,7 @@ class FilterDescriptor(object): class DropoutDescriptor(object): + def __init__(self, handle, dropout, seed): ptr = ctypes.c_void_p() check_error(lib.cudnnCreateDropoutDescriptor(ctypes.byref(ptr))) @@ -241,10 +255,10 @@ class DropoutDescriptor(object): check_error(lib.cudnnDestroyDropoutDescriptor(self)) - class RNNDescriptor(object): + def __init__(self, hidden_size, num_layers, dropout_desc, input_mode, - bidirectional, mode, datatype): + bidirectional, mode, datatype): ptr = ctypes.c_void_p() check_error(lib.cudnnCreateRNNDescriptor(ctypes.byref(ptr))) self._as_parameter_ = ptr @@ -272,13 +286,16 @@ class ConvolutionAlgoPerf(ctypes.Structure): ("memory", ctypes.c_size_t), ] + def check_error(status): if status is not 0: raise CuDNNError(status) + def get_error_string(status): return lib.cudnnGetErrorString(status) + def get_handle(): if lib is None: _loadlib() @@ -296,11 +313,12 @@ _typemap = { } _sizeofmap = { - CUDNN_DATA_HALF : 2, - CUDNN_DATA_FLOAT : 4, - CUDNN_DATA_DOUBLE : 8, + CUDNN_DATA_HALF: 2, + CUDNN_DATA_FLOAT: 4, + CUDNN_DATA_DOUBLE: 8, } + def c_type(tensor): if isinstance(tensor, torch.cuda.HalfTensor): return ctypes.c_float @@ 
-311,10 +329,12 @@ def c_type(tensor): else: raise ValueError("unknown type '{}'".format(type(tensor))) + def int_array(itr): array_type = ctypes.c_int * len(itr) return array_type(*itr) + def descriptor(tensor, N=None): if N is not None: descriptor = TensorDescriptorArray(N) @@ -331,9 +351,11 @@ _autotuner_forward = {} _autotuner_backward_data = {} _autotuner_backward_filter = {} + def convolution_autotuner_key(idesc, weight_desc, conv_desc): return (idesc.as_tuple(), weight_desc.as_tuple(), conv_desc.as_tuple()) + def convolution_forward_algorithm(idesc, weight_desc, conv_desc, odesc): k = convolution_autotuner_key(idesc, weight_desc, conv_desc) if k in _autotuner_forward: @@ -360,15 +382,19 @@ def convolution_forward_algorithm(idesc, weight_desc, conv_desc, odesc): wlimit, ctypes.byref(fwd_alg))) return fwd_alg + def convolution_forward_workspace_size(*args): check_error(lib.cudnnGetConvolutionForwardWorkspaceSize(*args)) + def convolution_forward(*args): check_error(lib.cudnnConvolutionForward(*args)) + def convolution_backward_data(*args): return check_error(lib.cudnnConvolutionBackwardData(*args)) + def convolution_backward_data_algorithm(weight_desc, odesc, conv_desc, idesc): k = convolution_autotuner_key(idesc, weight_desc, conv_desc) if k in _autotuner_backward_data: @@ -395,12 +421,15 @@ def convolution_backward_data_algorithm(weight_desc, odesc, conv_desc, idesc): wlimit, ctypes.byref(bwd_data_alg))) return bwd_data_alg + def convolution_backward_data_workspace_size(*args): return check_error(lib.cudnnGetConvolutionBackwardDataWorkspaceSize(*args)) + def convolution_backward_filter(*args): return check_error(lib.cudnnConvolutionBackwardFilter(*args)) + def convolution_backward_filter_algorithm(idesc, odesc, conv_desc, weight_desc): k = convolution_autotuner_key(idesc, weight_desc, conv_desc) if k in _autotuner_backward_filter: @@ -427,11 +456,14 @@ def convolution_backward_filter_algorithm(idesc, odesc, conv_desc, weight_desc): wlimit, ctypes.byref(bwd_filter_alg))) return bwd_filter_alg + def convolution_backward_filter_workspace_size(*args): return check_error(lib.cudnnGetConvolutionBackwardFilterWorkspaceSize(*args)) + def convolution_backward_bias(*args): check_error(lib.cudnnConvolutionBackwardBias(*args)) + def add_tensor(*args): check_error(lib.cudnnAddTensor(*args)) diff --git a/torch/backends/cudnn/rnn.py b/torch/backends/cudnn/rnn.py index 19ec7a288a..9d3cfce57d 100644 --- a/torch/backends/cudnn/rnn.py +++ b/torch/backends/cudnn/rnn.py @@ -3,6 +3,7 @@ import torch.backends.cudnn as cudnn from torch.backends.cudnn import check_error import ctypes + def get_cudnn_mode(mode): if mode == 'RNN_RELU': return cudnn.CUDNN_RNN_RELU @@ -17,9 +18,10 @@ def get_cudnn_mode(mode): class Unserializable(object): + def __init__(self, inner): self.inner = inner - + def get(self): return self.inner @@ -39,6 +41,7 @@ def init_dropout_descriptor(fn, handle): fn.dropout_seed ) + def init_rnn_descriptor(fn): return cudnn.RNNDescriptor( fn.hidden_size, @@ -161,7 +164,6 @@ def get_parameters(fn, handle, weight_buf): cur_offset = offset + filter_dim_a[0] - params.append(layer_params) return params @@ -237,7 +239,7 @@ def forward(fn, input, hx, weight, output, hy): if tuple(hx.size()) != hidden_size: raise RuntimeError('Expected hidden size {}, got {}'.format( - hidden_size, tuple(hx.size()))) + hidden_size, tuple(hx.size()))) if cx is not None and tuple(cx.size()) != hidden_size: raise RuntimeError('Expected cell size {}, got {}'.format( hidden_size, tuple(cx.size()))) @@ -295,7 +297,6 @@ def 
forward(fn, input, hx, weight, output, hy): output = output.transpose_(0, 1) - def backward_grad(fn, input, hx, weight, output, grad_output, grad_hy, grad_input, grad_hx): with torch.cuda.device_of(input): handle = cudnn.get_handle() diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index cd947b5b24..fab8cbc36f 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -51,9 +51,9 @@ def _load_cudart(): except OSError: pass raise RuntimeError("couldn't find libcudart. Make sure CUDA libraries " - "are installed in a default location, or that they're in " + - ("DYLD_LIBRARY_PATH" if system == 'Darwin' else "LD_LIBRARY_PATH") + - ".") + "are installed in a default location, or that they're in " + + ("DYLD_LIBRARY_PATH" if system == 'Darwin' else "LD_LIBRARY_PATH") + + ".") def _check_driver(): @@ -259,67 +259,112 @@ class _CudaBase(object): class DoubleStorage(_CudaBase, torch._C.CudaDoubleStorageBase, _StorageBase): pass + + class FloatStorage(_CudaBase, torch._C.CudaFloatStorageBase, _StorageBase): pass + + class LongStorage(_CudaBase, torch._C.CudaLongStorageBase, _StorageBase): pass + + class IntStorage(_CudaBase, torch._C.CudaIntStorageBase, _StorageBase): pass + + class ShortStorage(_CudaBase, torch._C.CudaShortStorageBase, _StorageBase): pass + + class CharStorage(_CudaBase, torch._C.CudaCharStorageBase, _StorageBase): pass + + class ByteStorage(_CudaBase, torch._C.CudaByteStorageBase, _StorageBase): pass + + class HalfStorage(_CudaBase, torch._C.CudaHalfStorageBase, _StorageBase): pass + class DoubleTensor(_CudaBase, torch._C.CudaDoubleTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return DoubleStorage + + class FloatTensor(_CudaBase, torch._C.CudaFloatTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return FloatStorage + + class LongTensor(_CudaBase, torch._C.CudaLongTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return LongStorage + + class IntTensor(_CudaBase, torch._C.CudaIntTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return IntStorage + + class ShortTensor(_CudaBase, torch._C.CudaShortTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(cls): return ShortStorage + + class CharTensor(_CudaBase, torch._C.CudaCharTensorBase, _TensorBase): + def is_signed(self): # TODO return False + @classmethod def storage_type(cls): return CharStorage + + class ByteTensor(_CudaBase, torch._C.CudaByteTensorBase, _TensorBase): + def is_signed(self): return False + @classmethod def storage_type(cls): return ByteStorage + + class HalfTensor(_CudaBase, torch._C.CudaHalfTensorBase, _TensorBase): + def is_signed(self): return True + @classmethod def storage_type(): return HalfStorage diff --git a/torch/cuda/comm.py b/torch/cuda/comm.py index d0607152bf..e6650a7bb6 100644 --- a/torch/cuda/comm.py +++ b/torch/cuda/comm.py @@ -4,6 +4,7 @@ from torch._utils import _accumulate # TODO: sync streams when implemented + def broadcast(tensor, devices): """Broadcasts a tensor to a number of GPUs. 
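The cuDNN wrapper classes reworked above (TensorDescriptor, FilterDescriptor, RNNDescriptor and friends in torch/backends/cudnn/__init__.py) all lean on the same ctypes idiom: the raw handle is stored in _as_parameter_, so the Python object can be handed directly to the C API, and __del__ releases it. A stripped-down sketch of that pattern, with hypothetical fooCreate/fooDestroy symbols rather than the real cuDNN entry points::

    import ctypes

    def check_status(status):
        if status != 0:
            raise RuntimeError('call failed with status {}'.format(status))

    class Handle(object):

        def __init__(self, lib):
            ptr = ctypes.c_void_p()
            check_status(lib.fooCreate(ctypes.byref(ptr)))   # C side fills in the opaque handle
            self._as_parameter_ = ptr                        # ctypes passes `self` as this pointer
            self._lib = lib

        def __del__(self):
            check_status(self._lib.fooDestroy(self))         # `self` converts via _as_parameter_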
diff --git a/torch/cuda/nccl.py b/torch/cuda/nccl.py index 78c1a01c92..5e9de84709 100644 --- a/torch/cuda/nccl.py +++ b/torch/cuda/nccl.py @@ -92,6 +92,7 @@ nccl_types = { class NcclError(RuntimeError): + def __init__(self, status): self.status = status msg = '{0} ({1})'.format(status_codes.get(status), status) @@ -103,6 +104,7 @@ class NcclComm(ctypes.c_void_p): class NcclCommList(object): + def __init__(self, devices): self.devices = devices ptrs = (NcclComm * len(devices))() @@ -141,7 +143,7 @@ def communicator(inputs, outputs=None): def cudaStream(): # TODO: return the current stream - #ffi.C.THCState_getCurrentStream(cutorch.getState()) + # ffi.C.THCState_getCurrentStream(cutorch.getState()) return None @@ -202,7 +204,7 @@ def all_gather(inputs, outputs): def reduce_scatter(inputs, outputs, op=SUM): - _check_inputs(inputs, outputs, 1.0/len(inputs)) + _check_inputs(inputs, outputs, 1.0 / len(inputs)) comm = communicator(inputs, outputs) count = inputs[0].numel() // len(inputs) data_type = nccl_types[inputs[0].type()] diff --git a/torch/cuda/random.py b/torch/cuda/random.py index 1b4bc54fbc..8d49587c1d 100644 --- a/torch/cuda/random.py +++ b/torch/cuda/random.py @@ -35,4 +35,3 @@ def seed_all(): def initial_seed(): _lazy_init() return _C._cuda_initialSeed() - diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index a4cca7fe15..042c3e0cc7 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -8,6 +8,7 @@ ERROR_NOT_READY = 34 class CudaError(RuntimeError): + def __init__(self, code): msg = cudart().cudaGetErrorString(code).decode('utf-8') super(CudaError, self).__init__('{0} ({1})'.format(msg, code)) diff --git a/torch/functional.py b/torch/functional.py index a26cda6283..bcbceffca8 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -1,16 +1,18 @@ import torch from ._utils import _range + def split(tensor, split_size, dim=0): if dim < 0: dim += tensor.dim() dim_size = tensor.size(dim) num_splits = (dim_size + split_size - 1) // split_size last_split_size = split_size - (split_size * num_splits - dim_size) + def get_split_size(i): - return split_size if i < num_splits-1 else last_split_size - return tuple(tensor.narrow(int(dim), int(i*split_size), int(get_split_size(i))) for i - in _range(0, num_splits)) + return split_size if i < num_splits - 1 else last_split_size + return tuple(tensor.narrow(int(dim), int(i * split_size), int(get_split_size(i))) for i + in _range(0, num_splits)) def chunk(tensor, n_chunks, dim=0): diff --git a/torch/legacy/nn/Abs.py b/torch/legacy/nn/Abs.py index 475180af66..4b61c32041 100644 --- a/torch/legacy/nn/Abs.py +++ b/torch/legacy/nn/Abs.py @@ -1,24 +1,25 @@ import torch from .Module import Module + class Abs(Module): + def __init__(self): super(Abs, self).__init__() def updateOutput(self, input): self._backend.Abs_updateOutput( - self._backend.library_state, - input, - self.output + self._backend.library_state, + input, + self.output ) return self.output def updateGradInput(self, input, gradOutput): self._backend.Abs_updateGradInput( - self._backend.library_state, - input, - gradOutput, - self.gradInput + self._backend.library_state, + input, + gradOutput, + self.gradInput ) return self.gradInput - diff --git a/torch/legacy/nn/AbsCriterion.py b/torch/legacy/nn/AbsCriterion.py index 9c440faee0..a7cb79b69e 100644 --- a/torch/legacy/nn/AbsCriterion.py +++ b/torch/legacy/nn/AbsCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class AbsCriterion(Criterion): def __init__(self, sizeAverage=True): @@ -10,7 
+11,7 @@ class AbsCriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.AbsCriterion_updateOutput( self._backend.library_state, input, @@ -21,7 +22,6 @@ class AbsCriterion(Criterion): self.output = self.output_tensor[0] return self.output - def updateGradInput(self, input, target): self._backend.AbsCriterion_updateGradInput( self._backend.library_state, @@ -31,4 +31,3 @@ class AbsCriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/Add.py b/torch/legacy/nn/Add.py index 847c4cad1d..09847f9a5a 100644 --- a/torch/legacy/nn/Add.py +++ b/torch/legacy/nn/Add.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class Add(Module): def __init__(self, inputSize, scalar=False): @@ -19,16 +20,16 @@ class Add(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.bias.size(0)) + stdv = 1. / math.sqrt(self.bias.size(0)) self.bias.uniform_(-stdv, stdv) def updateOutput(self, input): self.output.resize_as_(input).copy_(input) if self.scalar: - self.output.add_(self.bias[0]); + self.output.add_(self.bias[0]) else: batchSize = input.size(0) if self._ones.size(0) != batchSize: @@ -42,16 +43,15 @@ class Add(Module): def updateGradInput(self, input, gradOutput): if self.gradInput is not None: - self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - return self.gradInput + self.gradInput.resize_as_(gradOutput).copy_(gradOutput) + return self.gradInput def accGradParameters(self, input, gradOutput, scale=1): if self.gradBias.size(0) == 1: - self.gradBias[0] = self.gradBias[0] + scale*gradOutput.sum(); + self.gradBias[0] = self.gradBias[0] + scale * gradOutput.sum() else: - if input.is_same_size(self.bias): - self.gradBias.add_(scale, gradOutput) - else: - gradOutput = gradOutput.view(input.size(0), -1) - self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones) - + if input.is_same_size(self.bias): + self.gradBias.add_(scale, gradOutput) + else: + gradOutput = gradOutput.view(input.size(0), -1) + self.gradBias.view(-1).addmv_(scale, gradOutput.t(), self._ones) diff --git a/torch/legacy/nn/AddConstant.py b/torch/legacy/nn/AddConstant.py index 8582d947b8..4e9f10dcbf 100644 --- a/torch/legacy/nn/AddConstant.py +++ b/torch/legacy/nn/AddConstant.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class AddConstant(Module): def __init__(self, constant_scalar, inplace=False): @@ -29,4 +30,3 @@ class AddConstant(Module): self.gradInput.copy_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/BCECriterion.py b/torch/legacy/nn/BCECriterion.py index 1e3642daa0..94ca51a0bc 100644 --- a/torch/legacy/nn/BCECriterion.py +++ b/torch/legacy/nn/BCECriterion.py @@ -2,6 +2,8 @@ import torch from .Criterion import Criterion # TODO: use THNN + + class BCECriterion(Criterion): eps = 1e-12 @@ -20,7 +22,7 @@ class BCECriterion(Criterion): raise RuntimeError("input and target size mismatch") if self.buffer is None: - self.buffer = input.new() + self.buffer = input.new() buffer = self.buffer weights = self.weights @@ -38,7 +40,7 @@ class BCECriterion(Criterion): output = torch.dot(target, buffer) # log(1 - input) * (1 - target) - torch.mul(input, -1, out=buffer).add_(1+self.eps).log_() + torch.mul(input, -1, out=buffer).add_(1 + self.eps).log_() if weights is not None: buffer.mul_(weights) @@ -52,42 +54,39 @@ class 
BCECriterion(Criterion): return self.output - def updateGradInput(self, input, target): # - (target - input) / ( input (1 - input) ) # The gradient is slightly incorrect: # It should have be divided by (input + self.eps) (1 - input + self.eps) # but it is divided by input (1 - input + self.eps) + self.eps # This modification requires less memory to be computed. - if input.nelement() != target.nelement(): + if input.nelement() != target.nelement(): raise RuntimeError("input and target size mismatch") - if self.buffer is None: - self.buffer = input.new() - - buffer = self.buffer - weights = self.weights - gradInput = self.gradInput - - if weights is not None and target.dim() != 1: - weights = self.weights.view(1, target.size(1)).expand_as(target) + if self.buffer is None: + self.buffer = input.new() + buffer = self.buffer + weights = self.weights + gradInput = self.gradInput - buffer.resize_as_(input) - # - x ( 1 + self.eps -x ) + self.eps - torch.add(input, -1, out=buffer).add_(-self.eps).mul_(input).add_(-self.eps) + if weights is not None and target.dim() != 1: + weights = self.weights.view(1, target.size(1)).expand_as(target) - gradInput.resize_as_(input) - # y - x - torch.add(target, -1, input, out=gradInput) - # - (y - x) / ( x ( 1 + self.eps -x ) + self.eps ) - gradInput.div_(buffer) + buffer.resize_as_(input) + # - x ( 1 + self.eps -x ) + self.eps + torch.add(input, -1, out=buffer).add_(-self.eps).mul_(input).add_(-self.eps) - if weights is not None: - gradInput.mul_(weights) + gradInput.resize_as_(input) + # y - x + torch.add(target, -1, input, out=gradInput) + # - (y - x) / ( x ( 1 + self.eps -x ) + self.eps ) + gradInput.div_(buffer) - if self.sizeAverage: - gradInput.div_(target.nelement()) + if weights is not None: + gradInput.mul_(weights) - return gradInput + if self.sizeAverage: + gradInput.div_(target.nelement()) + return gradInput diff --git a/torch/legacy/nn/BatchNormalization.py b/torch/legacy/nn/BatchNormalization.py index f7a18e8d24..3c17c6481a 100644 --- a/torch/legacy/nn/BatchNormalization.py +++ b/torch/legacy/nn/BatchNormalization.py @@ -32,6 +32,7 @@ import torch from .Module import Module from .utils import clear + class BatchNormalization(Module): # expected dimension of input nDim = 2 @@ -51,44 +52,45 @@ class BatchNormalization(Module): self.save_std = None if self.affine: - self.weight = torch.Tensor(nOutput) - self.bias = torch.Tensor(nOutput) - self.gradWeight = torch.Tensor(nOutput) - self.gradBias = torch.Tensor(nOutput) - self.reset() + self.weight = torch.Tensor(nOutput) + self.bias = torch.Tensor(nOutput) + self.gradWeight = torch.Tensor(nOutput) + self.gradBias = torch.Tensor(nOutput) + self.reset() else: - self.weight = None - self.bias = None - self.gradWeight = None - self.gradBias = None + self.weight = None + self.bias = None + self.gradWeight = None + self.gradBias = None def reset(self): if self.weight is not None: - self.weight.uniform_() + self.weight.uniform_() if self.bias is not None: - self.bias.zero_() + self.bias.zero_() self.running_mean.zero_() self.running_var.fill_(1) def _checkInputDim(self, input): if input.dim() != self.nDim: - raise RuntimeError('only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim())) + raise RuntimeError( + 'only mini-batch supported ({}D tensor), got {}D tensor instead'.format(self.nDim, input.dim())) if input.size(1) != self.running_mean.nelement(): raise RuntimeError('got {}-feature tensor, expected {}'.format(input.size(1), self.running_mean.nelement())) def 
_makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): if self._input is None: - self._input = input.new() + self._input = input.new() self._input.resize_as_(input).copy_(input) input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput @@ -101,10 +103,10 @@ class BatchNormalization(Module): self.output.resize_as_(input) if self.save_mean is None: - self.save_mean = input.new() + self.save_mean = input.new() self.save_mean.resize_as_(self.running_mean) if self.save_std is None: - self.save_std = input.new() + self.save_std = input.new() self.save_std.resize_as_(self.running_var) self._backend.BatchNormalization_updateOutput( @@ -124,7 +126,6 @@ class BatchNormalization(Module): return self.output - def _backward(self, input, gradOutput, scale, gradInput=None, gradWeight=None, gradBias=None): self._checkInputDim(input) self._checkInputDim(gradOutput) @@ -135,8 +136,7 @@ class BatchNormalization(Module): scale = scale or 1. if gradInput is not None: - gradInput.resize_as_(gradOutput) - + gradInput.resize_as_(gradOutput) self._backend.BatchNormalization_backward( self._backend.library_state, @@ -177,15 +177,14 @@ class BatchNormalization(Module): # first 5 buffers are not present in the current implementation, # but we keep them for cleaning old saved models clear(self, [ - 'buffer', - 'buffer2', - 'centered', - 'std', - 'normalized', - '_input', - '_gradOutput', - 'save_mean', - 'save_std', + 'buffer', + 'buffer2', + 'centered', + 'std', + 'normalized', + '_input', + '_gradOutput', + 'save_mean', + 'save_std', ]) return super(BatchNormalization, self).clearState() - diff --git a/torch/legacy/nn/Bilinear.py b/torch/legacy/nn/Bilinear.py index 2d699216b0..e4e0049262 100644 --- a/torch/legacy/nn/Bilinear.py +++ b/torch/legacy/nn/Bilinear.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class Bilinear(Module): def _assertInput(self, input): @@ -23,14 +24,13 @@ class Bilinear(Module): if gradOutput.size(1) != self.weight.size(0): raise RuntimeError('number of columns in gradOutput does not match layer\'s output size') - def __init__(self, inputSize1, inputSize2, outputSize, bias=True): # set up model: super(Bilinear, self).__init__() - self.weight = torch.Tensor(outputSize, inputSize1, inputSize2) + self.weight = torch.Tensor(outputSize, inputSize1, inputSize2) self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2) if bias: - self.bias = torch.Tensor(outputSize) + self.bias = torch.Tensor(outputSize) self.gradBias = torch.Tensor(outputSize) else: self.bias = None @@ -53,13 +53,12 @@ class Bilinear(Module): self.bias.uniform_(-stdv, stdv) return self - def updateOutput(self, input): self._assertInput(input) # set up buffer: if self.buff2 is None: - self.buff2 = input[0].new() + self.buff2 = input[0].new() self.buff2.resize_as_(input[1]) # compute output scores: @@ -74,7 +73,6 @@ class Bilinear(Module): return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: return @@ -87,38 +85,36 @@ class Bilinear(Module): #: first slice of weight tensor (k = 1) self.gradInput[0].addmm_(input[1], self.weight[0].t()) self.gradInput[0].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) + self.gradInput[0].size(1))) self.gradInput[1].addmm_(input[0], 
self.weight[0]) self.gradInput[1].mul_(gradOutput.narrow(1, 0, 1).expand(self.gradInput[1].size(0), - self.gradInput[1].size(1))) + self.gradInput[1].size(1))) #: remaining slices of weight tensor if self.weight.size(0) > 1: if self.buff1 is None: - self.buff1 = input[0].new() + self.buff1 = input[0].new() self.buff1.resize_as_(input[0]) for k in range(1, self.weight.size(0)): torch.mm(input[1], self.weight[k].t(), out=self.buff1) self.buff1.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) + self.gradInput[0].size(1))) self.gradInput[0].add_(self.buff1) torch.mm(input[0], self.weight[k], out=self.buff2) self.buff2.mul_(gradOutput.narrow(1, k, 1).expand(self.gradInput[1].size(0), - self.gradInput[1].size(1))) + self.gradInput[1].size(1))) self.gradInput[1].add_(self.buff2) return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): self._assertInputGradOutput(input, gradOutput) # make sure we have buffer: if self.buff1 is None: - self.buff1 = input[0].new() + self.buff1 = input[0].new() self.buff1.resize_as_(input[0]) # accumulate parameter gradients: @@ -129,15 +125,13 @@ class Bilinear(Module): if self.bias is not None: self.gradBias.add_(scale, gradOutput.sum(0)) - def __repr__(self): return str(type(self)) + \ - '({}x{} -> {}) {}'.format( - self.weight.size(1), self.weight.size(2), self.weight.size(0), - (' without bias' if self.bias is None else '') - ) + '({}x{} -> {}) {}'.format( + self.weight.size(1), self.weight.size(2), self.weight.size(0), + (' without bias' if self.bias is None else '') + ) def clearState(self): clear(self, 'buff1', 'buff2') return super(Bilinear, self).clearState() - diff --git a/torch/legacy/nn/CAddTable.py b/torch/legacy/nn/CAddTable.py index 9b8481ec50..bcefa11f2a 100644 --- a/torch/legacy/nn/CAddTable.py +++ b/torch/legacy/nn/CAddTable.py @@ -1,25 +1,25 @@ import torch from .Module import Module + class CAddTable(Module): + def __init__(self, inplace=False): super(CAddTable, self).__init__() self.inplace = inplace self.gradInput = [] - def updateOutput(self, input): if self.inplace: - self.output.set_(input[0]) + self.output.set_(input[0]) else: - self.output.resize_as_(input[0]).copy_(input[0]) + self.output.resize_as_(input[0]).copy_(input[0]) for i in range(1, len(input)): - self.output.add_(input[i]) + self.output.add_(input[i]) return self.output - def updateGradInput(self, input, gradOutput): for i in range(len(input)): if i >= len(self.gradInput): @@ -34,4 +34,3 @@ class CAddTable(Module): del self.gradInput[len(input):] return self.gradInput - diff --git a/torch/legacy/nn/CDivTable.py b/torch/legacy/nn/CDivTable.py index 790944786f..c60a5bb927 100644 --- a/torch/legacy/nn/CDivTable.py +++ b/torch/legacy/nn/CDivTable.py @@ -1,7 +1,9 @@ import torch from .Module import Module + class CDivTable(Module): + def __init__(self, ): super(CDivTable, self).__init__() self.gradInput = [] @@ -20,4 +22,3 @@ class CDivTable(Module): del self.gradInput[len(input):] return self.gradInput - diff --git a/torch/legacy/nn/CMul.py b/torch/legacy/nn/CMul.py index 40998ad43d..4880d25d34 100644 --- a/torch/legacy/nn/CMul.py +++ b/torch/legacy/nn/CMul.py @@ -4,6 +4,7 @@ import torch from .Module import Module from .utils import clear, contiguousView + class CMul(Module): def __init__(self, *args): @@ -33,11 +34,10 @@ class CMul(Module): if stdv is not None: stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.weight.nelement()) + stdv = 1. 
/ math.sqrt(self.weight.nelement()) self.weight.uniform_(-stdv, stdv) - def updateOutput(self, input): # lazy-initialize if self._output is None: @@ -61,10 +61,9 @@ class CMul(Module): return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: - return + return if self._gradOutput is None: self._gradOutput = input.new() @@ -85,7 +84,6 @@ class CMul(Module): return self.gradInput - def accGradParameters(self, input, gradOutput, scale=1): if self._input is None: self._input = input.new() @@ -103,17 +101,17 @@ class CMul(Module): def type(self, type=None, tensorCache=None): if type: - self.clearState() + self.clearState() return super(CMul, self).type(type, tensorCache) def clearState(self): clear(self, [ - '_input', - '_output', - '_weight', - '_gradWeight', - '_expand', - '_repeat', - '_sum', + '_input', + '_output', + '_weight', + '_gradWeight', + '_expand', + '_repeat', + '_sum', ]) return super(CMul, self).clearState() diff --git a/torch/legacy/nn/CMulTable.py b/torch/legacy/nn/CMulTable.py index f79114c33f..64a58f0c79 100644 --- a/torch/legacy/nn/CMulTable.py +++ b/torch/legacy/nn/CMulTable.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class CMulTable(Module): def __init__(self, ): @@ -17,7 +18,7 @@ class CMulTable(Module): def updateGradInput_efficient(self, input, gradOutput): if self.tout is None: - self.tout = input[0].new() + self.tout = input[0].new() self.tout.resize_as_(self.output) for i in range(len(input)): if len(self.gradInput) <= i: diff --git a/torch/legacy/nn/CSubTable.py b/torch/legacy/nn/CSubTable.py index b8ee7ab22d..85d8527f8c 100644 --- a/torch/legacy/nn/CSubTable.py +++ b/torch/legacy/nn/CSubTable.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class CSubTable(Module): def __init__(self, ): @@ -14,12 +15,11 @@ class CSubTable(Module): def updateGradInput(self, input, gradOutput): if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() + self.gradInput[0] = input[0].new() if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() + self.gradInput[1] = input[1].new() self.gradInput[0].resize_as_(input[0]).copy_(gradOutput) self.gradInput[1].resize_as_(input[1]).copy_(gradOutput).mul_(-1) self.gradInput = self.gradInput[:2] return self.gradInput - diff --git a/torch/legacy/nn/Clamp.py b/torch/legacy/nn/Clamp.py index 90eec28bb0..0bfcac3266 100644 --- a/torch/legacy/nn/Clamp.py +++ b/torch/legacy/nn/Clamp.py @@ -1,6 +1,8 @@ import torch from .HardTanh import HardTanh + class Clamp(HardTanh): + def __init__(self, min_value, max_value): super(Clamp, self,).__init__(min_value, max_value) diff --git a/torch/legacy/nn/ClassNLLCriterion.py b/torch/legacy/nn/ClassNLLCriterion.py index 60aec29f60..c9a67f7ff0 100644 --- a/torch/legacy/nn/ClassNLLCriterion.py +++ b/torch/legacy/nn/ClassNLLCriterion.py @@ -1,7 +1,9 @@ import torch from .Criterion import Criterion + class ClassNLLCriterion(Criterion): + def __init__(self, weights=None, sizeAverage=True): super(ClassNLLCriterion, self).__init__() self.sizeAverage = sizeAverage @@ -27,7 +29,6 @@ class ClassNLLCriterion(Criterion): self.output = self.output_tensor[0] return self.output - def updateGradInput(self, input, target): self.gradInput.resize_as_(input).zero_() target = target.long() diff --git a/torch/legacy/nn/ClassSimplexCriterion.py b/torch/legacy/nn/ClassSimplexCriterion.py index 17bf77f354..a6e7329630 100644 --- a/torch/legacy/nn/ClassSimplexCriterion.py +++ b/torch/legacy/nn/ClassSimplexCriterion.py @@ -12,19 
+12,20 @@ from .MSECriterion import MSECriterion Reference: http.//arxiv.org/abs/1506.08230 """ + class ClassSimplexCriterion(MSECriterion): def __init__(self, nClasses): - super(ClassSimplexCriterion, self).__init__() - self.nClasses = nClasses + super(ClassSimplexCriterion, self).__init__() + self.nClasses = nClasses - # embedding the simplex in a space of dimension strictly greater than - # the minimum possible (nClasses-1) is critical for effective training. - simp = self._regsplex(nClasses - 1) - self.simplex = torch.cat((simp, torch.zeros(simp.size(0), nClasses - simp.size(1))), 1) - self._target = torch.Tensor(nClasses) + # embedding the simplex in a space of dimension strictly greater than + # the minimum possible (nClasses-1) is critical for effective training. + simp = self._regsplex(nClasses - 1) + self.simplex = torch.cat((simp, torch.zeros(simp.size(0), nClasses - simp.size(1))), 1) + self._target = torch.Tensor(nClasses) - self.output_tensor = None + self.output_tensor = None def _regsplex(self, n): """ @@ -51,11 +52,11 @@ class ClassSimplexCriterion(MSECriterion): if k == 0: a[k][k] = 1 else: - a[k][k] = math.sqrt(1 - a[k:k+1, 0:k+1].norm()**2) + a[k][k] = math.sqrt(1 - a[k:k + 1, 0:k + 1].norm()**2) # fill_ the k-th coordinates for the vectors of the remaining vertices - c = (a[k][k]**2 - 1 - 1/n) / a[k][k] - a[k+1:n+2, k:k+1].fill_(c) + c = (a[k][k]**2 - 1 - 1 / n) / a[k][k] + a[k + 1:n + 2, k:k + 1].fill_(c) return a @@ -69,20 +70,20 @@ class ClassSimplexCriterion(MSECriterion): self._target[i].copy_(self.simplex[int(target[i])]) def updateOutput(self, input, target): - self._transformTarget(target) + self._transformTarget(target) - assert input.nelement() == self._target.nelement() - if self.output_tensor is None: - self.output_tensor = input.new(1) - self._backend.MSECriterion_updateOutput( + assert input.nelement() == self._target.nelement() + if self.output_tensor is None: + self.output_tensor = input.new(1) + self._backend.MSECriterion_updateOutput( self._backend.library_state, input, self._target, self.output_tensor, self.sizeAverage - ) - self.output = self.output_tensor[0] - return self.output + ) + self.output = self.output_tensor[0] + return self.output def updateGradInput(self, input, target): assert input.nelement() == self._target.nelement() @@ -100,6 +101,5 @@ class ClassSimplexCriterion(MSECriterion): def getTopPrediction(self, input): prod = self.getPredictions(input) - _, maxs = prod.max(prod.ndimension()-1) + _, maxs = prod.max(prod.ndimension() - 1) return maxs.view(-1) - diff --git a/torch/legacy/nn/Concat.py b/torch/legacy/nn/Concat.py index e9e924ce93..cb54d7674c 100644 --- a/torch/legacy/nn/Concat.py +++ b/torch/legacy/nn/Concat.py @@ -1,6 +1,7 @@ import torch from .Container import Container + class Concat(Container): def __init__(self, dimension): @@ -22,9 +23,9 @@ class Concat(Container): offset = 0 for i, module in enumerate(self.modules): - currentOutput = outs[i] - self.output.narrow(self.dimension, offset, currentOutput.size(self.dimension)).copy_(currentOutput) - offset = offset + currentOutput.size(self.dimension) + currentOutput = outs[i] + self.output.narrow(self.dimension, offset, currentOutput.size(self.dimension)).copy_(currentOutput) + offset = offset + currentOutput.size(self.dimension) return self.output @@ -34,9 +35,11 @@ class Concat(Container): offset = 0 for i, module in enumerate(self.modules): currentOutput = module.output - currentGradInput = module.updateGradInput(input, gradOutput.narrow(self.dimension, offset, 
currentOutput.size(self.dimension))) + currentGradInput = module.updateGradInput(input, gradOutput.narrow( + self.dimension, offset, currentOutput.size(self.dimension))) - if currentGradInput: # if the module does not produce a gradInput (for example first layer),: ignore it and move on. + # if the module does not produce a gradInput (for example first layer),: ignore it and move on. + if currentGradInput: if i == 0: self.gradInput.copy_(currentGradInput) else: @@ -46,24 +49,25 @@ class Concat(Container): return self.gradInput - def accGradParameters(self, input, gradOutput, scale=1): offset = 0 for i, module in enumerate(self.modules): - currentOutput = module.output - module.accGradParameters( - input, - gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), - scale) - offset = offset + currentOutput.size(self.dimension) + currentOutput = module.output + module.accGradParameters( + input, + gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), + scale) + offset = offset + currentOutput.size(self.dimension) def backward(self, input, gradOutput, scale=1): self.gradInput.resize_as_(input) offset = 0 for i, module in enumerate(self.modules): currentOutput = module.output - currentGradInput = module.backward(input, gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), scale) - if currentGradInput is not None: # if the module.es not produce a gradInput (for example first layer),: ignore it and move on. + currentGradInput = module.backward(input, gradOutput.narrow( + self.dimension, offset, currentOutput.size(self.dimension)), scale) + # if the module.es not produce a gradInput (for example first layer),: ignore it and move on. + if currentGradInput is not None: if i == 0: self.gradInput.copy_(currentGradInput) else: @@ -75,12 +79,12 @@ class Concat(Container): def accUpdateGradParameters(self, input, gradOutput, lr): offset = 0 for i, module in enumerate(self.modules): - currentOutput = module.output - module.accUpdateGradParameters( - input, - gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), - lr) - offset = offset + currentOutput.size(self.dimension) + currentOutput = module.output + module.accUpdateGradParameters( + input, + gradOutput.narrow(self.dimension, offset, currentOutput.size(self.dimension)), + lr) + offset = offset + currentOutput.size(self.dimension) def __tostring__(self): tab = ' ' @@ -92,7 +96,7 @@ class Concat(Container): res = torch.type(self) res += ' {' + line + tab + 'input' for i in range(len(self.modules)): - if i == len(self.modules)-1: + if i == len(self.modules) - 1: res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + extlast) else: res += line + tab + next + '(' + i + '): ' + str(self.modules[i]).replace(line, line + tab + ext) diff --git a/torch/legacy/nn/ConcatTable.py b/torch/legacy/nn/ConcatTable.py index 628b9595fe..afebf8c296 100644 --- a/torch/legacy/nn/ConcatTable.py +++ b/torch/legacy/nn/ConcatTable.py @@ -1,6 +1,7 @@ import torch from .Container import Container + class ConcatTable(Container): def __init__(self, ): @@ -23,7 +24,7 @@ class ConcatTable(Container): l1[i] = res else: f(l1, i, v) - for i in range(len(l1)-1, len(l2)-1, -1): + for i in range(len(l1) - 1, len(l2) - 1, -1): del l1[i] return l1 @@ -44,6 +45,7 @@ class ConcatTable(Container): if i == 0: self.gradInput = self.gradInput if wasTable else [] + def fn(l, i, v): if i >= len(l): assert len(l) == i @@ -82,11 +84,11 @@ class ConcatTable(Container): 
def accGradParameters(self, input, gradOutput, scale=1): for i, module in ipairs(self.modules): - self.rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale) + self.rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale) def accUpdateGradParameters(self, input, gradOutput, lr): for i, module in ipairs(self.modules): - self.rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr) + self.rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr) def __repr__(self): tab = ' ' @@ -98,14 +100,13 @@ class ConcatTable(Container): res = torch.typename(self) res = res + ' {' + line + tab + 'input' for i in range(len(self.modules)): - if i == len(self.modules)-1: - res = res + line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + extlast) - else: - res = res + line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext) - + if i == len(self.modules) - 1: + res = res + line + tab + next + '(' + str(i) + '): ' + \ + str(self.modules[i]).replace(line, line + tab + extlast) + else: + res = res + line + tab + next + '(' + str(i) + '): ' + \ + str(self.modules[i]).replace(line, line + tab + ext) res = res + line + tab + last + 'output' res = res + line + '}' return res - - diff --git a/torch/legacy/nn/Container.py b/torch/legacy/nn/Container.py index dba0f78802..84a726e003 100644 --- a/torch/legacy/nn/Container.py +++ b/torch/legacy/nn/Container.py @@ -4,11 +4,12 @@ from .utils import clear from functools import wraps import sys + class Container(Module): def __init__(self, *args): - super(Container, self).__init__(*args) - self.modules = [] + super(Container, self).__init__(*args) + self.modules = [] def add(self, module): self.modules.append(module) @@ -18,11 +19,11 @@ class Container(Module): return self.modules[index] def size(self): - return len(self.modules) + return len(self.modules) def applyToModules(self, func): - for module in self.modules: - func(module) + for module in self.modules: + func(module) def zeroGradParameters(self): self.applyToModules(lambda m: m.zeroGradParameters()) @@ -46,16 +47,16 @@ class Container(Module): self.applyToModules(lambda m: m.reset(stdv)) def parameters(self): - w = [] - gw = [] - for module in self.modules: - mparam = module.parameters() - if mparam is not None: - w.extend(mparam[0]) - gw.extend(mparam[1]) - if not w: - return - return w, gw + w = [] + gw = [] + for module in self.modules: + mparam = module.parameters() + if mparam is not None: + w.extend(mparam[0]) + gw.extend(mparam[1]) + if not w: + return + return w, gw def clearState(self): clear('output') @@ -63,4 +64,3 @@ class Container(Module): for module in self.modules: module.clearState() return self - diff --git a/torch/legacy/nn/Contiguous.py b/torch/legacy/nn/Contiguous.py index 0371ceb6f4..aacadb05e5 100644 --- a/torch/legacy/nn/Contiguous.py +++ b/torch/legacy/nn/Contiguous.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Contiguous(Module): def updateOutput(self, input): @@ -11,7 +12,6 @@ class Contiguous(Module): return self.output - def updateGradInput(self, input, gradOutput): if not gradOutput.is_contiguous(): self.gradInput.resize_as_(gradOutput).copy_(gradOutput) @@ -19,4 +19,3 @@ class Contiguous(Module): self.gradInput.set_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/Copy.py b/torch/legacy/nn/Copy.py index 4f9a9c9c72..71c8682cc9 100644 --- a/torch/legacy/nn/Copy.py +++ b/torch/legacy/nn/Copy.py @@ 
-1,6 +1,7 @@ import torch from .Module import Module + class Copy(Module): def __init__(self, intype, outtype, dontCast=False): @@ -13,15 +14,12 @@ class Copy(Module): self.output.resize_(input.size()).copy_(input) return self.output - def updateGradInput(self, input, gradOutput): self.gradInput.resize_(gradOutput.size()).copy_(gradOutput) return self.gradInput - def type(self, type=None, tensorCache=None): if type and self.dontCast: - return self + return self return super(Copy, self).type(self, type, tensorCache) - diff --git a/torch/legacy/nn/Cosine.py b/torch/legacy/nn/Cosine.py index 71888b3797..cda75c5467 100644 --- a/torch/legacy/nn/Cosine.py +++ b/torch/legacy/nn/Cosine.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class Cosine(Module): def __init__(self, inputSize, outputSize): @@ -22,7 +23,7 @@ class Cosine(Module): if stdv is not None: stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.weight.size(0)) + stdv = 1. / math.sqrt(self.weight.size(0)) self.weight.uniform_(-stdv, stdv) def updateOutput(self, input): @@ -32,9 +33,9 @@ class Cosine(Module): outputSize = self.weight.size(0) if self._weightNorm is None: - self._weightNorm = self.weight.new() + self._weightNorm = self.weight.new() if self._inputNorm is None: - self._inputNorm = self.weight.new() + self._inputNorm = self.weight.new() # y_j = (w_j * x) / ( || w_j || * || x || ) @@ -53,12 +54,11 @@ class Cosine(Module): self.output.div_(self._inputNorm.expand_as(self.output)) return self.output - def updateGradInput(self, input, gradOutput): assert input.dim() == 2 if self.gradInput is None: - return + return inputSize = self.weight.size(1) outputSize = self.weight.size(0) @@ -72,15 +72,15 @@ class Cosine(Module): nelement = self.gradInput.nelement() self.gradInput.resize_as_(input) if self.gradInput.nelement() != nelement: - self.gradInput.zero_() + self.gradInput.zero_() inputNorm = self._inputNorm.expand_as(input) weightNorm = self._weightNorm.view(1, outputSize).expand_as(gradOutput) if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() if self._sum is None: - self._sum = input.new() + self._sum = input.new() self.gradInput.copy_(input).div_(inputNorm) self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) @@ -107,13 +107,13 @@ class Cosine(Module): """ if self._weight is None: - self._weight = self.weight.new() + self._weight = self.weight.new() if self._sum is None: - self._sum = input.new() + self._sum = input.new() self._weight.resize_as_(self.weight).copy_(self.weight) if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) self._gradOutput.mul_(self.output) torch.sum(self._gradOutput, 0, out=self._sum) @@ -131,25 +131,23 @@ class Cosine(Module): def type(self, type=None, tensorCache=None): if type is not None: - # prevent premature memory allocations - self._input = None - self._weight = None - self._inputNorm = None - self._weightNorm = None - self._gradOutput = None - self._sum = None + # prevent premature memory allocations + self._input = None + self._weight = None + self._inputNorm = None + self._weightNorm = None + self._gradOutput = None + self._sum = None return super(Cosine, self).type(type, tensorCache) - def clearState(self): clear(self, [ - '_input', - '_weight', - '_gradOutput', - '_sum', - '_inputNorm', - '_weightNorm', + '_input', + '_weight', + '_gradOutput', + '_sum', + '_inputNorm', + 
'_weightNorm', ]) return super(Cosine, self).clearState() - diff --git a/torch/legacy/nn/CosineDistance.py b/torch/legacy/nn/CosineDistance.py index 0bea8ee341..b13b96408a 100644 --- a/torch/legacy/nn/CosineDistance.py +++ b/torch/legacy/nn/CosineDistance.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class CosineDistance(Module): def __init__(self, ): @@ -11,39 +12,38 @@ class CosineDistance(Module): self._input1 = None self._input2 = None self.buffer = None - self.w1 = None + self.w1 = None self.w22 = None - self.w = None + self.w = None self.w32 = None self.ones = None def _makeContiguous(self, input1, input2): if not input1.is_contiguous(): - if self._input1 is None: - self._input1 = input1.new() - self._input1.resize_as_(input1).copy_(input1) - input1 = self._input1 + if self._input1 is None: + self._input1 = input1.new() + self._input1.resize_as_(input1).copy_(input1) + input1 = self._input1 if not input2.is_contiguous(): - if self._input2 is None: - self._input2 = input2.new() - self._input2.resize_as_(input2).copy_(input2) - input2 = self._input2 + if self._input2 is None: + self._input2 = input2.new() + self._input2.resize_as_(input2).copy_(input2) + input2 = self._input2 return input1, input2 - def updateOutput(self, input): input1, input2 = input[0], input[1] input1, input2 = self._makeContiguous(input1, input2) if self.buffer is None: - self.buffer = input1.new() - self.w1 = input1.new() - self.w22 = input1.new() - self.w = input1.new() - self.w32 = input1.new() - self.ones = input1.new() + self.buffer = input1.new() + self.w1 = input1.new() + self.w22 = input1.new() + self.w = input1.new() + self.w32 = input1.new() + self.ones = input1.new() torch.mul(input1, input2, out=self.buffer) torch.sum(self.buffer, 1, out=self.w1) @@ -65,18 +65,17 @@ class CosineDistance(Module): return self.output - def updateGradInput(self, input, gradOutput): - v1 = input[0] - v2 = input[1] + v1 = input[0] + v2 = input[1] v1, v2 = self._makeContiguous(v1, v2) if len(self.gradInput) != 2: - if self.gradInput[0] is None: - self.gradInput[0] = v1.new() - if self.gradInput[1] is None: - self.gradInput[1] = v1.new() - self.gradInput = self.gradInput[:2] + if self.gradInput[0] is None: + self.gradInput[0] = v1.new() + if self.gradInput[1] is None: + self.gradInput[1] = v1.new() + self.gradInput = self.gradInput[:2] gw1 = self.gradInput[0] gw2 = self.gradInput[1] @@ -97,15 +96,13 @@ class CosineDistance(Module): return self.gradInput - def clearState(self): clear(self, [ - 'buffer', - 'w1', - 'w22', - 'w', - 'w32', - 'ones', + 'buffer', + 'w1', + 'w22', + 'w', + 'w32', + 'ones', ]) return super(CosineDistance, self).clearState() - diff --git a/torch/legacy/nn/CosineEmbeddingCriterion.py b/torch/legacy/nn/CosineEmbeddingCriterion.py index cafbe41d76..271842faae 100644 --- a/torch/legacy/nn/CosineEmbeddingCriterion.py +++ b/torch/legacy/nn/CosineEmbeddingCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class CosineEmbeddingCriterion(Criterion): def __init__(self, margin=0, sizeAverage=True): @@ -9,23 +10,22 @@ class CosineEmbeddingCriterion(Criterion): self.sizeAverage = sizeAverage self.gradInput = [torch.Tensor(), torch.Tensor()] self.buffer = None - self.w1 = None + self.w1 = None self.w22 = None - self.w = None + self.w = None self.w32 = None self._outputs = None self._idx = None - def updateOutput(self, input, y): input1, input2 = input[0], input[1] # keep backward compatibility if self.buffer is None: self.buffer = input1.new() - 
self.w1 = input1.new() + self.w1 = input1.new() self.w22 = input1.new() - self.w = input1.new() + self.w = input1.new() self.w32 = input1.new() self._outputs = input1.new() @@ -64,14 +64,13 @@ class CosineEmbeddingCriterion(Criterion): self.output = self._outputs.sum() if self.sizeAverage: - self.output = self.output / y.size(0) + self.output = self.output / y.size(0) return self.output - def updateGradInput(self, input, y): - v1 = input[0] - v2 = input[1] + v1 = input[0] + v2 = input[1] gw1 = self.gradInput[0] gw2 = self.gradInput[1] @@ -98,22 +97,21 @@ class CosineEmbeddingCriterion(Criterion): gw2[self._idx] = gw2[self._idx].mul_(-1) if self.sizeAverage: - gw1.div_(y.size(0)) - gw2.div_(y.size(0)) + gw1.div_(y.size(0)) + gw2.div_(y.size(0)) return self.gradInput def type(self, type=None, tensorCache=None): if not type: - return self._type + return self._type self._idx = None super(CosineEmbeddingCriterion, self).type(type, tensorCache) # comparison operators behave differently from cuda/c implementations if type == 'torch.cuda.FloatTensor': - self._idx = torch.cuda.ByteTensor() + self._idx = torch.cuda.ByteTensor() else: - self._idx = torch.ByteTensor() + self._idx = torch.ByteTensor() return self - diff --git a/torch/legacy/nn/Criterion.py b/torch/legacy/nn/Criterion.py index 4da81e6bfd..0c74470709 100644 --- a/torch/legacy/nn/Criterion.py +++ b/torch/legacy/nn/Criterion.py @@ -3,6 +3,7 @@ from .Module import Module from .utils import recursiveType import torch._thnn + class Criterion(object): def __init__(self): diff --git a/torch/legacy/nn/CriterionTable.py b/torch/legacy/nn/CriterionTable.py index f7a6479342..7e20a8fb85 100644 --- a/torch/legacy/nn/CriterionTable.py +++ b/torch/legacy/nn/CriterionTable.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class CriterionTable(Module): def __init__(self, criterion): @@ -15,4 +16,3 @@ class CriterionTable(Module): def updateGradInput(self, input, grad_output): self.criterion.updateGradInput(*input) return self.gradInput - diff --git a/torch/legacy/nn/CrossEntropyCriterion.py b/torch/legacy/nn/CrossEntropyCriterion.py index f6042cc14b..67e8b0d9ab 100644 --- a/torch/legacy/nn/CrossEntropyCriterion.py +++ b/torch/legacy/nn/CrossEntropyCriterion.py @@ -3,6 +3,7 @@ from .Criterion import Criterion from .LogSoftMax import LogSoftMax from .ClassNLLCriterion import ClassNLLCriterion + class CrossEntropyCriterion(Criterion): def __init__(self, weights=None): @@ -26,4 +27,3 @@ class CrossEntropyCriterion(Criterion): self.lsm.updateGradInput(input, self.nll.gradInput) self.gradInput = self.lsm.gradInput.view(size) return self.gradInput - diff --git a/torch/legacy/nn/DepthConcat.py b/torch/legacy/nn/DepthConcat.py index b7abe9ad5e..19c31873ff 100644 --- a/torch/legacy/nn/DepthConcat.py +++ b/torch/legacy/nn/DepthConcat.py @@ -14,18 +14,19 @@ import math import torch from .Concat import Concat + class DepthConcat(Concat): def windowNarrow(self, output, currentOutput, offset): outputWindow = output.narrow(self.dimension, offset, currentOutput.size(self.dimension)) for dim in range(len(self.outputSize)): - currentSize = currentOutput.size(dim) - if dim != self.dimension and self.outputSize[dim] != currentSize: - # 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side) - # 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side) - # 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad) - start = int(math.floor(((self.outputSize[dim] - currentSize) / 2))) - outputWindow = outputWindow.narrow(dim, start, currentSize) + currentSize = 
currentOutput.size(dim) + if dim != self.dimension and self.outputSize[dim] != currentSize: + # 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side) + # 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side) + # 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad) + start = int(math.floor(((self.outputSize[dim] - currentSize) / 2))) + outputWindow = outputWindow.narrow(dim, start, currentSize) return outputWindow def updateOutput(self, input): diff --git a/torch/legacy/nn/DistKLDivCriterion.py b/torch/legacy/nn/DistKLDivCriterion.py index 984895f1de..074e1db1ff 100644 --- a/torch/legacy/nn/DistKLDivCriterion.py +++ b/torch/legacy/nn/DistKLDivCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class DistKLDivCriterion(Criterion): def __init__(self, sizeAverage=True): @@ -11,7 +12,7 @@ class DistKLDivCriterion(Criterion): def updateOutput(self, input, target): assert input.is_same_size(target) if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.DistKLDivCriterion_updateOutput( self._backend.library_state, input, @@ -32,4 +33,3 @@ class DistKLDivCriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/DotProduct.py b/torch/legacy/nn/DotProduct.py index 6584cc59ba..c91d5c738a 100644 --- a/torch/legacy/nn/DotProduct.py +++ b/torch/legacy/nn/DotProduct.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class DotProduct(Module): def __init__(self): @@ -13,7 +14,7 @@ class DotProduct(Module): input1, input2 = input[0], input[1] if self.buffer is None: - self.buffer = input1.new() + self.buffer = input1.new() torch.mul(input1, input2, out=self.buffer) torch.sum(self.buffer, 1, out=self.output) @@ -26,11 +27,11 @@ class DotProduct(Module): not_batch = False if len(self.gradInput) != 2: - if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() - if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() - self.gradInput = self.gradInput[:2] + if self.gradInput[0] is None: + self.gradInput[0] = input[0].new() + if self.gradInput[1] is None: + self.gradInput[1] = input[1].new() + self.gradInput = self.gradInput[:2] gw1 = self.gradInput[0] gw2 = self.gradInput[1] @@ -46,4 +47,3 @@ class DotProduct(Module): def clearState(self): clear(self, 'buffer') return super(DotProduct, self).clearState() - diff --git a/torch/legacy/nn/Dropout.py b/torch/legacy/nn/Dropout.py index b1be2c8d8a..41330e503b 100644 --- a/torch/legacy/nn/Dropout.py +++ b/torch/legacy/nn/Dropout.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class Dropout(Module): def __init__(self, p=0.5, inplace=False): @@ -19,8 +20,8 @@ class Dropout(Module): if self.p > 0 and self.train: self.noise.resize_as_(input) - self.noise.bernoulli_(1-self.p) - self.noise.div_(1-self.p) + self.noise.bernoulli_(1 - self.p) + self.noise.div_(1 - self.p) self.output.mul_(self.noise) return self.output @@ -32,7 +33,7 @@ class Dropout(Module): self.gradInput.resize_as_(gradOutput).copy_(gradOutput) if self.p > 0 and self.train: - self.gradInput.mul_(self.noise) # simply mask the gradients with the noise vector + self.gradInput.mul_(self.noise) # simply mask the gradients with the noise vector return self.gradInput @@ -45,4 +46,3 @@ class Dropout(Module): def clearState(self): clear(self, 'noise') return super(Dropout, self).clearState() - diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py index b84562cd19..56e3a85352 100644 --- 
a/torch/legacy/nn/ELU.py +++ b/torch/legacy/nn/ELU.py @@ -2,6 +2,7 @@ import torch from .Module import Module + class ELU(Module): """ Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter @@ -39,4 +40,3 @@ class ELU(Module): def __repr__(self): return '{}(alpha={:.3f})'.format(str(type(self)), self.alpha) - diff --git a/torch/legacy/nn/Euclidean.py b/torch/legacy/nn/Euclidean.py index b52b077492..d529982eaa 100644 --- a/torch/legacy/nn/Euclidean.py +++ b/torch/legacy/nn/Euclidean.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class Euclidean(Module): def __init__(self, inputSize, outputSize): @@ -18,11 +19,11 @@ class Euclidean(Module): self.fastBackward = True self.reset() - self._input = None - self._weight = None - self._expand = None + self._input = None + self._weight = None + self._expand = None self._expand2 = None - self._repeat = None + self._repeat = None self._repeat2 = None self._div = None self._output = None @@ -32,32 +33,32 @@ class Euclidean(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.weight.size(0)) + stdv = 1. / math.sqrt(self.weight.size(0)) self.weight.uniform_(-stdv, stdv) def _view(self, res, src, *args): if src.is_contiguous(): - res.set_(src.view(*args)) + res.set_(src.view(*args)) else: - res.set_(src.contiguous().view(*args)) + res.set_(src.contiguous().view(*args)) def updateOutput(self, input): # lazy initialize buffers if self._input is None: - self._input = input.new() + self._input = input.new() if self._weight is None: - self._weight = self.weight.new() + self._weight = self.weight.new() if self._expand is None: - self._expand = self.output.new() + self._expand = self.output.new() if self._expand2 is None: - self._expand2 = self.output.new() + self._expand2 = self.output.new() if self._repeat is None: - self._repeat = self.output.new() + self._repeat = self.output.new() if self._repeat2 is None: - self._repeat2 = self.output.new() + self._repeat2 = self.output.new() inputSize, outputSize = self.weight.size(0), self.weight.size(1) @@ -88,19 +89,19 @@ class Euclidean(Module): def updateGradInput(self, input, gradOutput): if self.gradInput is None: - return + return if self._div is None: - self._div = input.new() + self._div = input.new() if self._output is None: - self._output = self.output.new() + self._output = self.output.new() if self._gradOutput is None: - self._gradOutput = input.new() + self._gradOutput = input.new() if self._expand3 is None: - self._expand3 = input.new() + self._expand3 = input.new() if not self.fastBackward: - self.updateOutput(input) + self.updateOutput(input) inputSize, outputSize = self.weight.size(0), self.weight.size(1) @@ -126,13 +127,11 @@ class Euclidean(Module): else: torch.mul(self._repeat, self._expand3, out=self._repeat2) - torch.sum(self._repeat2, 2, out=self.gradInput) self.gradInput.resize_as_(input) return self.gradInput - def accGradParameters(self, input, gradOutput, scale=1): inputSize, outputSize = self.weight.size(0), self.weight.size(1) @@ -144,32 +143,30 @@ class Euclidean(Module): # assumes a preceding call to updateGradInput assert input.dim() == 2 if self._sum is None: - self._sum = input.new() + self._sum = input.new() torch.sum(self._repeat2, 0, out=self._sum) self._sum.resize_(inputSize, outputSize) self.gradWeight.add_(-scale, self._sum) def type(self, type=None, tensorCache=None): if type: - # prevent premature memory allocations - self.clearState() + # 
prevent premature memory allocations + self.clearState() return super(Euclidean, self).type(type, tensorCache) - def clearState(self): clear(self, [ - '_input', - '_output', - '_gradOutput', - '_weight', - '_div', - '_sum', - '_expand', - '_expand2', - '_expand3', - '_repeat', - '_repeat2', + '_input', + '_output', + '_gradOutput', + '_weight', + '_div', + '_sum', + '_expand', + '_expand2', + '_expand3', + '_repeat', + '_repeat2', ]) return super(Euclidean, self).clearState() - diff --git a/torch/legacy/nn/Exp.py b/torch/legacy/nn/Exp.py index 97cd9c4e57..7156a99eb9 100644 --- a/torch/legacy/nn/Exp.py +++ b/torch/legacy/nn/Exp.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Exp(Module): def updateOutput(self, input): @@ -8,4 +9,3 @@ class Exp(Module): def updateGradInput(self, input, gradOutput): return torch.mul(self.output, gradOutput, out=self.gradInput) - diff --git a/torch/legacy/nn/FlattenTable.py b/torch/legacy/nn/FlattenTable.py index dbb1950abb..1468f0cedd 100644 --- a/torch/legacy/nn/FlattenTable.py +++ b/torch/legacy/nn/FlattenTable.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class FlattenTable(Module): def __init__(self): @@ -59,7 +60,6 @@ class FlattenTable(Module): return self.output - def updateGradInput(self, input, gradOutput): assert isinstance(input, list) assert isinstance(gradOutput, list) @@ -69,11 +69,10 @@ class FlattenTable(Module): # However, we should check that the gradInput is valid: if not self._checkMapping(gradOutput, self.gradInput, self.input_map): - self.gradInput = self._inverseFlatten(gradOutput, self.input_map) + self.gradInput = self._inverseFlatten(gradOutput, self.input_map) return self.gradInput - def type(self, type=None, tensorCache=None): if not type: return self._type @@ -81,8 +80,6 @@ class FlattenTable(Module): # conversions. Just force the tables to be empty. 
self.clearState() - def clearState(self): self.input_map = [] return super(FlattenTable, self).clearState() - diff --git a/torch/legacy/nn/GradientReversal.py b/torch/legacy/nn/GradientReversal.py index 033d60a448..36c048b3b0 100644 --- a/torch/legacy/nn/GradientReversal.py +++ b/torch/legacy/nn/GradientReversal.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class GradientReversal(Module): def __init__(self, lambd=1): @@ -19,4 +20,3 @@ class GradientReversal(Module): self.gradInput.copy_(gradOutput) self.gradInput.mul_(-self.lambd) return self.gradInput - diff --git a/torch/legacy/nn/HardShrink.py b/torch/legacy/nn/HardShrink.py index 4015ab8030..99b3bb2292 100644 --- a/torch/legacy/nn/HardShrink.py +++ b/torch/legacy/nn/HardShrink.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class HardShrink(Module): def __init__(self, lambd=0.5): @@ -26,4 +27,3 @@ class HardShrink(Module): self.lambd ) return self.gradInput - diff --git a/torch/legacy/nn/HardTanh.py b/torch/legacy/nn/HardTanh.py index d7ce767bea..b8bae62f9b 100644 --- a/torch/legacy/nn/HardTanh.py +++ b/torch/legacy/nn/HardTanh.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class HardTanh(Module): def __init__(self, min_value=-1, max_value=1, inplace=False): @@ -32,4 +33,3 @@ class HardTanh(Module): self.inplace ) return self.gradInput - diff --git a/torch/legacy/nn/HingeEmbeddingCriterion.py b/torch/legacy/nn/HingeEmbeddingCriterion.py index fb36467c9d..d94bd14bbd 100644 --- a/torch/legacy/nn/HingeEmbeddingCriterion.py +++ b/torch/legacy/nn/HingeEmbeddingCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class HingeEmbeddingCriterion(Criterion): def __init__(self, margin=1, sizeAverage=True): @@ -11,7 +12,7 @@ class HingeEmbeddingCriterion(Criterion): def updateOutput(self, input, y): if self.buffer is None: - self.buffer = input.new() + self.buffer = input.new() self.buffer.resize_as_(input).copy_(input) self.buffer[torch.eq(y, -1.)] = 0 self.output = self.buffer.sum() @@ -34,4 +35,3 @@ class HingeEmbeddingCriterion(Criterion): self.gradInput.mul_(1. 
/ input.nelement()) return self.gradInput - diff --git a/torch/legacy/nn/Identity.py b/torch/legacy/nn/Identity.py index ed9f302f28..09c7e1b8e8 100644 --- a/torch/legacy/nn/Identity.py +++ b/torch/legacy/nn/Identity.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class Identity(Module): def updateOutput(self, input): @@ -14,8 +15,7 @@ class Identity(Module): def clearState(self): clear(self, [ - 'output', - 'gradInput', + 'output', + 'gradInput', ]) return super(Identity, self).clearState() - diff --git a/torch/legacy/nn/Index.py b/torch/legacy/nn/Index.py index 0a792d6e73..e88c454bff 100644 --- a/torch/legacy/nn/Index.py +++ b/torch/legacy/nn/Index.py @@ -1,25 +1,25 @@ import torch from .Module import Module + class Index(Module): def __init__(self, dimension): - super(Index, self).__init__() - self.dimension = dimension - self.gradInput = [self.gradInput] + super(Index, self).__init__() + self.dimension = dimension + self.gradInput = [self.gradInput] def updateOutput(self, input): - t = input[0] - index = input[1] - torch.index_select(t, self.dimension, index, out=self.output) - return self.output + t = input[0] + index = input[1] + torch.index_select(t, self.dimension, index, out=self.output) + return self.output def updateGradInput(self, input, gradOutput): - t = input[0] - index = input[1] - - gradInput = self.gradInput[0] # no gradient for the index variable - gradInput.resize_as_(t).zero_() - gradInput.index_add_(self.dimension, index, gradOutput) - return self.gradInput + t = input[0] + index = input[1] + gradInput = self.gradInput[0] # no gradient for the index variable + gradInput.resize_as_(t).zero_() + gradInput.index_add_(self.dimension, index, gradOutput) + return self.gradInput diff --git a/torch/legacy/nn/JoinTable.py b/torch/legacy/nn/JoinTable.py index e0df22f9e1..0031945d08 100644 --- a/torch/legacy/nn/JoinTable.py +++ b/torch/legacy/nn/JoinTable.py @@ -43,7 +43,7 @@ class JoinTable(Module): dim = self._getPositiveDimension(input) for i in range(len(input)): - if len(self.gradInput) < i+1: + if len(self.gradInput) < i + 1: self.gradInput.append(input[i].new()) self.gradInput[i].resize_as_(input[i]) self.gradInput = self.gradInput[:len(input)] diff --git a/torch/legacy/nn/L1Cost.py b/torch/legacy/nn/L1Cost.py index 60f0096b83..1a41588f94 100644 --- a/torch/legacy/nn/L1Cost.py +++ b/torch/legacy/nn/L1Cost.py @@ -2,6 +2,7 @@ import torch from .Criterion import Criterion from .utils import clear + class L1Cost(Criterion): def __init__(self): @@ -11,7 +12,7 @@ class L1Cost(Criterion): def updateOutput(self, input, target=None): assert target is None if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.L1Cost_updateOutput( self._backend.library_state, input, @@ -33,4 +34,3 @@ class L1Cost(Criterion): def clearState(self): clear(self, 'output_tensor') return super(L1Cost, self).clearState() - diff --git a/torch/legacy/nn/L1HingeEmbeddingCriterion.py b/torch/legacy/nn/L1HingeEmbeddingCriterion.py index 8c985a0610..c414934221 100644 --- a/torch/legacy/nn/L1HingeEmbeddingCriterion.py +++ b/torch/legacy/nn/L1HingeEmbeddingCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class L1HingeEmbeddingCriterion(Criterion): def __init__(self, margin=1): @@ -9,9 +10,9 @@ class L1HingeEmbeddingCriterion(Criterion): self.gradInput = [torch.Tensor(), torch.Tensor()] def updateOutput(self, input, y): - self.output = input[0].dist(input[1], 1); + self.output = 
input[0].dist(input[1], 1) if y == -1: - self.output = max(0, self.margin - self.output); + self.output = max(0, self.margin - self.output) return self.output @@ -33,4 +34,3 @@ class L1HingeEmbeddingCriterion(Criterion): self.gradInput[1].zero_().add_(-1, self.gradInput[0]) return self.gradInput - diff --git a/torch/legacy/nn/L1Penalty.py b/torch/legacy/nn/L1Penalty.py index 8f2af01276..05472d75f6 100644 --- a/torch/legacy/nn/L1Penalty.py +++ b/torch/legacy/nn/L1Penalty.py @@ -5,6 +5,7 @@ from .Module import Module # [gradOutput] to the gradient of the L1 loss. The [input] is copied to # the [output]. + class L1Penalty(Module): def __init__(self, l1weight, sizeAverage=False, provideOutput=True): @@ -34,4 +35,3 @@ class L1Penalty(Module): self.gradInput.add_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/LeakyReLU.py b/torch/legacy/nn/LeakyReLU.py index c3175946ca..ca3a5cc6b2 100644 --- a/torch/legacy/nn/LeakyReLU.py +++ b/torch/legacy/nn/LeakyReLU.py @@ -1,15 +1,16 @@ import torch from .Module import Module + class LeakyReLU(Module): - def __init__(self, negval=1/100, inplace=False): + def __init__(self, negval=1 / 100, inplace=False): super(LeakyReLU, self).__init__() if isinstance(negval, bool): - inplace = negval - self.negval = 1/100 + inplace = negval + self.negval = 1 / 100 else: - self.negval = negval + self.negval = negval # default for inplace is False self.inplace = inplace @@ -27,7 +28,6 @@ class LeakyReLU(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self._backend.LeakyReLU_updateGradInput( self._backend.library_state, @@ -39,7 +39,5 @@ class LeakyReLU(Module): ) return self.gradInput - def __repr__(self): return str(type(self)) + '({:.4f})'.format(self.negval) - diff --git a/torch/legacy/nn/Linear.py b/torch/legacy/nn/Linear.py index 1f27ff5158..eb69a63e70 100644 --- a/torch/legacy/nn/Linear.py +++ b/torch/legacy/nn/Linear.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class Linear(Module): def __init__(self, inputSize, outputSize, bias=True): @@ -24,7 +25,7 @@ class Linear(Module): if stdv is not None: stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.weight.size(1)) + stdv = 1. 
/ math.sqrt(self.weight.size(1)) self.weight.uniform_(-stdv, stdv) if self.bias is not None: @@ -35,7 +36,7 @@ class Linear(Module): def _updateAddBuffer(self, input): nframe = input.size(0) if self.addBuffer is None: - self.addBuffer = input.new() + self.addBuffer = input.new() if self.addBuffer.nelement() != nframe: self.addBuffer.resize_(nframe).fill_(1) @@ -80,9 +81,7 @@ class Linear(Module): clear(self, 'addBuffer') return super(Linear, self).clearState() - def __repr__(self): return super(Linear, self).__repr__() + \ - '({} -> {})'.format(self.weight.size(1), self.weight.size(0)) + \ - (' without bias' if self.bias is None else '') - + '({} -> {})'.format(self.weight.size(1), self.weight.size(0)) + \ + (' without bias' if self.bias is None else '') diff --git a/torch/legacy/nn/Log.py b/torch/legacy/nn/Log.py index a036688905..1f5e4bd206 100644 --- a/torch/legacy/nn/Log.py +++ b/torch/legacy/nn/Log.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Log(Module): def updateOutput(self, input): @@ -9,10 +10,9 @@ class Log(Module): self.output.log_() return self.output - def updateGradInput(self, input, gradOutput) : + def updateGradInput(self, input, gradOutput): self.gradInput.resize_as_(input) self.gradInput.fill_(1) self.gradInput.div_(input) self.gradInput.mul_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/LogSigmoid.py b/torch/legacy/nn/LogSigmoid.py index 3373b83725..d6b8761729 100644 --- a/torch/legacy/nn/LogSigmoid.py +++ b/torch/legacy/nn/LogSigmoid.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class LogSigmoid(Module): def __init__(self): @@ -10,7 +11,7 @@ class LogSigmoid(Module): def updateOutput(self, input): if self.buffer is None: - self.buffer = input.new() + self.buffer = input.new() self._backend.LogSigmoid_updateOutput( self._backend.library_state, input, @@ -32,5 +33,3 @@ class LogSigmoid(Module): def clearState(self): clear(self, 'buffer') return super(LogSigmoid, self).clearState() - - diff --git a/torch/legacy/nn/LogSoftMax.py b/torch/legacy/nn/LogSoftMax.py index b10f483a1f..948e9512ac 100644 --- a/torch/legacy/nn/LogSoftMax.py +++ b/torch/legacy/nn/LogSoftMax.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class LogSoftMax(Module): def updateOutput(self, input): @@ -11,7 +12,6 @@ class LogSoftMax(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self._backend.LogSoftMax_updateGradInput( self._backend.library_state, @@ -21,4 +21,3 @@ class LogSoftMax(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/LookupTable.py b/torch/legacy/nn/LookupTable.py index 6413521cc9..dc610b7444 100644 --- a/torch/legacy/nn/LookupTable.py +++ b/torch/legacy/nn/LookupTable.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class LookupTable(Module): def __init__(self, nIndex, nOutput, paddingValue=-1, maxNorm=None, normType=None): @@ -59,16 +60,15 @@ class LookupTable(Module): self.renorm(input) input = self._makeInputContiguous(input) if input.dim() == 1: - torch.index_select(self.weight, 0, input, out=self.output) + torch.index_select(self.weight, 0, input, out=self.output) elif input.dim() == 2: - torch.index_select(self.weight, 0, input.view(-1), out=self.output) - self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1)) + torch.index_select(self.weight, 0, input.view(-1), out=self.output) + self.output = self.output.view(input.size(0), input.size(1), self.weight.size(1)) else: - 
raise RuntimeError("input must be a vector or matrix") + raise RuntimeError("input must be a vector or matrix") return self.output - def updateGradInput(self, input, gradOutput): # the input can be of any type (as in the forward it's # converted anyway to LongTensor) thus, need to allocate @@ -81,7 +81,6 @@ class LookupTable(Module): return self.gradInput - def accGradParameters(self, input, gradOutput, scale=1): input = self._input if self.copiedInput else input if input.dim() == 2: @@ -110,16 +109,16 @@ class LookupTable(Module): def renorm(self, input): if self.maxNorm is None: - return + return # copy input into _input, so _input is continous. # The copied _input will be modified in the C code. self._input.resize_(input.size()).copy_(input) row_idx = self._input if row_idx.dim() == 2: - row_idx = row_idx.view(-1) + row_idx = row_idx.view(-1) elif row_idx.dim() != 1: - raise RuntimeError("input must be a vector or matrix") + raise RuntimeError("input must be a vector or matrix") # "row_idx" and "weight" will be modified in the C code self._backend.LookupTable_renorm( @@ -151,4 +150,3 @@ class LookupTable(Module): def clearState(self): clear(self, '_count', '_input', '_sorted', '_indices', '_gradOutput') return super(LookupTable, self).clearState() - diff --git a/torch/legacy/nn/MM.py b/torch/legacy/nn/MM.py index 35077edc15..30b0bc6d40 100644 --- a/torch/legacy/nn/MM.py +++ b/torch/legacy/nn/MM.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class MM(Module): def __init__(self, transA=False, transB=False): @@ -35,9 +36,9 @@ class MM(Module): def updateGradInput(self, input, gradOutput): if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() + self.gradInput[0] = input[0].new() if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() + self.gradInput[1] = input[1].new() assert len(input) == 2 a, b = input @@ -69,4 +70,3 @@ class MM(Module): getattr(torch, f)(a, gradOutput, out=self.gradInput[1]) return self.gradInput - diff --git a/torch/legacy/nn/MSECriterion.py b/torch/legacy/nn/MSECriterion.py index 5897d32ee1..05b3ee2a4e 100644 --- a/torch/legacy/nn/MSECriterion.py +++ b/torch/legacy/nn/MSECriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class MSECriterion(Criterion): def __init__(self, sizeAverage=True): @@ -10,7 +11,7 @@ class MSECriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.MSECriterion_updateOutput( self._backend.library_state, input, @@ -30,4 +31,3 @@ class MSECriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/MV.py b/torch/legacy/nn/MV.py index ff87422083..bebe1b8578 100644 --- a/torch/legacy/nn/MV.py +++ b/torch/legacy/nn/MV.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class MV(Module): """Module to perform matrix vector multiplication on two minibatch inputs, producing a minibatch. 
@@ -63,4 +64,3 @@ class MV(Module): self.gradInput[1] = M.t() * gradOutput return self.gradInput - diff --git a/torch/legacy/nn/MarginCriterion.py b/torch/legacy/nn/MarginCriterion.py index 628ca2a36f..23d3aed3d6 100644 --- a/torch/legacy/nn/MarginCriterion.py +++ b/torch/legacy/nn/MarginCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class MarginCriterion(Criterion): def __init__(self, margin=1, sizeAverage=True): @@ -11,7 +12,7 @@ class MarginCriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.MarginCriterion_updateOutput( self._backend.library_state, input, @@ -33,4 +34,3 @@ class MarginCriterion(Criterion): self.margin ) return self.gradInput - diff --git a/torch/legacy/nn/MarginRankingCriterion.py b/torch/legacy/nn/MarginRankingCriterion.py index 31de4660cb..b68c7444f3 100644 --- a/torch/legacy/nn/MarginRankingCriterion.py +++ b/torch/legacy/nn/MarginRankingCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class MarginRankingCriterion(Criterion): def __init__(self, margin=1, sizeAverage=True): @@ -15,29 +16,29 @@ class MarginRankingCriterion(Criterion): def updateOutput(self, input, y): if input[0].size(0) == 1: - self.output = max(0, -y*(input[0][0]-input[1][0]) + self.margin) + self.output = max(0, -y * (input[0][0] - input[1][0]) + self.margin) else: - if self._output is None: - self._output = input[0].clone() - self._output.resize_as_(input[0]) - self._output.copy_(input[0]) + if self._output is None: + self._output = input[0].clone() + self._output.resize_as_(input[0]) + self._output.copy_(input[0]) - self._output.add_(-1, input[1]) - self._output.mul_(-1).mul_(y) - self._output.add_(self.margin) + self._output.add_(-1, input[1]) + self._output.mul_(-1).mul_(y) + self._output.add_(self.margin) - self._output.clamp_(min=0) + self._output.clamp_(min=0) - self.output = self._output.sum() + self.output = self._output.sum() - if self.sizeAverage: - self.output = self.output / y.size(0) + if self.sizeAverage: + self.output = self.output / y.size(0) return self.output def updateGradInput(self, input, y): if input[0].size(0) == 1: - dist = -y * (input[0][0]-input[1][0]) + self.margin + dist = -y * (input[0][0] - input[1][0]) + self.margin if dist < 0: self.gradInput[0][0] = 0 self.gradInput[1][0] = 0 @@ -46,7 +47,7 @@ class MarginRankingCriterion(Criterion): self.gradInput[1][0] = y else: if self.dist is None: - self.dist = input[0].new() + self.dist = input[0].new() self.dist = self.dist.resize_as_(input[0]).copy_(input[0]) dist = self.dist @@ -55,7 +56,7 @@ class MarginRankingCriterion(Criterion): dist.add_(self.margin) if self.mask is None: - self.mask = input[0].new() + self.mask = input[0].new() self.mask = self.mask.resize_as_(input[0]).copy_(dist) mask = self.mask @@ -74,4 +75,3 @@ class MarginRankingCriterion(Criterion): self.gradInput[1].div_(y.size(0)) return self.gradInput - diff --git a/torch/legacy/nn/MaskedSelect.py b/torch/legacy/nn/MaskedSelect.py index c1abfff4d8..39be82d70e 100644 --- a/torch/legacy/nn/MaskedSelect.py +++ b/torch/legacy/nn/MaskedSelect.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class MaskedSelect(Module): def __init__(self): @@ -20,10 +21,10 @@ class MaskedSelect(Module): def updateGradInput(self, input, gradOutput): input, mask = input if input.type() == 'torch.cuda.FloatTensor': - torch.range(0, mask.nelement()-1, 
out=self._maskIndexBufferCPU).resize_(mask.size()) + torch.range(0, mask.nelement() - 1, out=self._maskIndexBufferCPU).resize_(mask.size()) self._maskIndexBuffer.resize_(self._maskIndexBufferCPU.size()).copy_(self._maskIndexBufferCPU) else: - torch.range(0, mask.nelement()-1, out=self._maskIndexBuffer).resize_(mask.size()) + torch.range(0, mask.nelement() - 1, out=self._maskIndexBuffer).resize_(mask.size()) torch.masked_select(self._maskIndexBuffer, mask, out=self._maskIndices) self._gradBuffer.resize_(input.nelement()).zero_() @@ -42,13 +43,13 @@ class MaskedSelect(Module): # These casts apply when switching between cuda/non-cuda types if type != 'torch.cuda.FloatTensor': - self._maskIndexBuffer = self._maskIndexBuffer.long() - self._maskIndices = self._maskIndices.long() - self._gradMask = self._gradMask.byte() + self._maskIndexBuffer = self._maskIndexBuffer.long() + self._maskIndices = self._maskIndices.long() + self._gradMask = self._gradMask.byte() else: - self._maskIndexBuffer = self._maskIndexBuffer.cuda() - self._maskIndices = self._maskIndices.cuda() - self._gradMask = self._gradMask.cuda() + self._maskIndexBuffer = self._maskIndexBuffer.cuda() + self._maskIndices = self._maskIndices.cuda() + self._gradMask = self._gradMask.cuda() self._type = type return self @@ -61,4 +62,3 @@ class MaskedSelect(Module): '_maskIndices', '_gradBuffer', '_gradMask']) - diff --git a/torch/legacy/nn/Max.py b/torch/legacy/nn/Max.py index 615532f0f7..eab9bcae02 100644 --- a/torch/legacy/nn/Max.py +++ b/torch/legacy/nn/Max.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear, addSingletondimension + class Max(Module): def __init__(self, dimension=0): @@ -13,25 +14,25 @@ class Max(Module): def _getPositiveDimension(self, input): dimension = self.dimension if dimension < 0: - dimension = input.dim() + dimension + dimension = input.dim() + dimension return dimension def _lazyInit(self): if self._output is None: - self._output = self.output.new() + self._output = self.output.new() if self._indices is None: - self._indices = \ - (torch.cuda.LongTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' else torch.LongTensor()) + self._indices = \ + (torch.cuda.LongTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' else torch.LongTensor()) def updateOutput(self, input): self._lazyInit() dimension = self._getPositiveDimension(input) torch.max(input, dimension, out=(self._output, self._indices)) if input.dim() > 1: - self.output.set_(self._output.select(dimension, 0)) + self.output.set_(self._output.select(dimension, 0)) else: - self.output.set_(self._output) + self.output.set_(self._output) return self.output @@ -39,9 +40,9 @@ class Max(Module): self._lazyInit() dimension = self._getPositiveDimension(input) if input.dim() > 1: - gradOutputView = addSingletondimension(gradOutput, dimension) + gradOutputView = addSingletondimension(gradOutput, dimension) else: - gradOutputView = gradOutput + gradOutputView = gradOutput self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView) return self.gradInput @@ -64,4 +65,3 @@ class Max(Module): def clearState(self): clear(self, '_indices', '_output') return super(Max, self).clearState() - diff --git a/torch/legacy/nn/Mean.py b/torch/legacy/nn/Mean.py index 905d91dbbf..67048d2aa6 100644 --- a/torch/legacy/nn/Mean.py +++ b/torch/legacy/nn/Mean.py @@ -9,8 +9,8 @@ Please use instead "nn.Sum(dimension, nInputDims, sizeAverage)" """ + class Mean(Sum): def __init__(self, dimension): 
super(Mean, self).__init__(dimension, True) - diff --git a/torch/legacy/nn/Min.py b/torch/legacy/nn/Min.py index 89809ea984..88967a09d0 100644 --- a/torch/legacy/nn/Min.py +++ b/torch/legacy/nn/Min.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear, addSingletondimension + class Min(Module): def __init__(self, dimension=0): @@ -13,25 +14,25 @@ class Min(Module): def _getPositiveDimension(self, input): dimension = self.dimension if dimension < 0: - dimension = input.dim() + dimension + dimension = input.dim() + dimension return dimension def _lazyInit(self): if self._output is None: - self._output = self.output.new() + self._output = self.output.new() if self._indices is None: - self._indices = \ - (torch.cuda.LongTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' else torch.LongTensor()) + self._indices = \ + (torch.cuda.LongTensor() if torch.typename(self.output) == 'torch.cuda.FloatTensor' else torch.LongTensor()) def updateOutput(self, input): self._lazyInit() dimension = self._getPositiveDimension(input) torch.min(input, dimension, out=(self._output, self._indices)) if input.dim() > 1: - self.output.set_(self._output.select(dimension, 0)) + self.output.set_(self._output.select(dimension, 0)) else: - self.output.set_(self._output) + self.output.set_(self._output) return self.output @@ -39,9 +40,9 @@ class Min(Module): self._lazyInit() dimension = self._getPositiveDimension(input) if input.dim() > 1: - gradOutputView = addSingletondimension(gradOutput, dimension) + gradOutputView = addSingletondimension(gradOutput, dimension) else: - gradOutputView = gradOutput + gradOutputView = gradOutput self.gradInput.resize_as_(input).zero_().scatter_(dimension, self._indices, gradOutputView) return self.gradInput @@ -64,4 +65,3 @@ class Min(Module): def clearState(self): clear(self, '_indices', '_output') return super(Min, self).clearState() - diff --git a/torch/legacy/nn/MixtureTable.py b/torch/legacy/nn/MixtureTable.py index b70df50619..12fc3ea830 100644 --- a/torch/legacy/nn/MixtureTable.py +++ b/torch/legacy/nn/MixtureTable.py @@ -27,11 +27,11 @@ class MixtureTable(Module): # buffers if self._gaterView is None: - self._gaterView = input[0].new() + self._gaterView = input[0].new() if self._expert is None: - self._expert = input[0].new() + self._expert = input[0].new() if self._expertView is None: - self._expertView = input[0].new() + self._expertView = input[0].new() self.dimG = 1 batchSize = gaterInput.size(0) @@ -43,7 +43,7 @@ class MixtureTable(Module): expertInput = expertInputs[0] if self.batchSize != batchSize: - size = [1] * (expertInput.dim()+1) + size = [1] * (expertInput.dim() + 1) if self.dimG > 0: size[0] = gaterInput.size(0) size[self.dim] = gaterInput.size(self.dimG) @@ -83,11 +83,11 @@ class MixtureTable(Module): # buffers if self._sum is None: - self._sum = input[0].new() + self._sum = input[0].new() if self._expertView2 is None: - self._expertView2 = input[0].new() + self._expertView2 = input[0].new() if self._expert2 is None: - self._expert2 = input[0].new() + self._expert2 = input[0].new() if self.table: if not self.backwardSetup: @@ -99,7 +99,6 @@ class MixtureTable(Module): gaterGradInput.resize_as_(gaterInput) self.backwardSetup = True - # like CMulTable, but with broadcasting for i, expertGradInput in enumerate(expertGradInputs): # gater updateGradInput @@ -140,7 +139,7 @@ class MixtureTable(Module): else: self._expertView2 = expert.view(gaterInput.size(0), gaterInput.size(1), -1) - torch.sum(self._expertView2, self.dimG+1, 
out=gaterGradInput) + torch.sum(self._expertView2, self.dimG + 1, out=gaterGradInput) gaterGradInput.resize_as_(gaterInput) # expert updateGradInput @@ -159,11 +158,11 @@ class MixtureTable(Module): def clearState(self, ): clear(self, [ - '_gaterView', - '_expert', - '_expertView', - '_sum', - '_expert2', - '_expertView2', + '_gaterView', + '_expert', + '_expertView', + '_sum', + '_expert2', + '_expertView2', ]) return super(MixtureTable, self).clearState() diff --git a/torch/legacy/nn/Module.py b/torch/legacy/nn/Module.py index 6bee40db2c..5d599a109d 100644 --- a/torch/legacy/nn/Module.py +++ b/torch/legacy/nn/Module.py @@ -2,6 +2,7 @@ import torch import torch._thnn from .utils import clear, recursiveType + class Module(object): def __init__(self): @@ -36,13 +37,11 @@ class Module(object): self.accGradParameters(input, gradOutput, scale) return self.gradInput - def backwardUpdate(self, input, gradOutput, lr): self.updateGradInput(input, gradOutput) self.accUpdateGradParameters(input, gradOutput, lr) return self.gradInput - def updateGradInput(self, input, gradOutput): return self.gradInput @@ -58,7 +57,6 @@ class Module(object): self.gradWeight = gradWeight self.gradBias = gradBias - def sharedAccUpdateGradParameters(self, input, gradOutput, lr): if self.parameters(): self.zeroGradParameters() @@ -92,7 +90,7 @@ class Module(object): def type(self, type=None, tensorCache=None): if type is None: - return self._type + return self._type tensorCache = tensorCache or {} @@ -146,6 +144,7 @@ class Module(object): # # TODO: This logically belongs to torch.Tensor, not nn. _flattenTensorBuffer = {} + def _flatten(self, parameters=[]): # returns True if tensor occupies a contiguous region of memory (no holes) @@ -155,14 +154,14 @@ class Module(object): sortedSize = torch.LongTensor(list(tensor.size())).index_select(0, perm) nRealDim = int(torch.clamp(sortedStride, 0, 1).sum()) sortedStride = sortedStride.narrow(0, 0, nRealDim).clone() - sortedSize = sortedSize.narrow(0, 0, nRealDim).clone() + sortedSize = sortedSize.narrow(0, 0, nRealDim).clone() t = tensor.new().set_(tensor.storage(), 0, - tuple(sortedSize), - tuple(sortedStride)) + tuple(sortedSize), + tuple(sortedStride)) return t.is_contiguous() if not parameters: - return torch.Tensor() + return torch.Tensor() Tensor = parameters[0].new BufferTensor = Module._flattenTensorBuffer.get(type(parameters[0]), Tensor) @@ -179,14 +178,12 @@ class Module(object): storages[key] = (storage, num_parameters) num_parameters = num_parameters + storage.size() - parameterMeta.append({ - 'storage_offset': param.storage_offset() + storages[key][1], - 'size' : param.size(), - 'stride' : param.stride() + 'storage_offset': param.storage_offset() + storages[key][1], + 'size': param.size(), + 'stride': param.stride() }) - # 2. construct a single tensor that will hold all the parameters flatParameters = BufferTensor(num_parameters).zero_() @@ -198,14 +195,14 @@ class Module(object): tmp.fill_(1) tensorsCompact = tensorsCompact and isCompact(tmp) - maskParameters = flatParameters.byte().clone() - compactOffsets = flatParameters.long().cumsum(0) + maskParameters = flatParameters.byte().clone() + compactOffsets = flatParameters.long().cumsum(0) used_parameters = compactOffsets[-1] # 4. 
copy storages into the flattened parameter tensor for storageAndOffset in storages.values(): storage, offset = storageAndOffset - flatParameters[slice(offset, offset+storage.size())].copy_(Tensor().set_(storage)) + flatParameters[slice(offset, offset + storage.size())].copy_(Tensor().set_(storage)) # 5. allow garbage collection storages = None @@ -214,22 +211,22 @@ class Module(object): # 6. compact the flattened parameters if there were holes if used_parameters != num_parameters: - assert tensorsCompact + assert tensorsCompact - flatParameters = BufferTensor(used_parameters).copy_( - flatParameters.masked_select(maskParameters)) - for meta in parameterMeta: - meta['storage_offset'] = compactOffsets[meta['storage_offset']] + flatParameters = BufferTensor(used_parameters).copy_( + flatParameters.masked_select(maskParameters)) + for meta in parameterMeta: + meta['storage_offset'] = compactOffsets[meta['storage_offset']] if BufferTensor != Tensor: - flatParameters = Tensor(flatParameters.nelement()).copy_(flatParameters) + flatParameters = Tensor(flatParameters.nelement()).copy_(flatParameters) # 7. fix up the parameter tensors to point at the flattened parameters for param, meta in zip(parameters, parameterMeta): - param.set_(flatParameters.storage(), - meta['storage_offset'], - meta['size'], - meta['stride']) + param.set_(flatParameters.storage(), + meta['storage_offset'], + meta['size'], + meta['stride']) return flatParameters @@ -290,4 +287,3 @@ class Module(object): for i, module in enumerate(self.modules): self.modules[i] = module.replace(callback) return out - diff --git a/torch/legacy/nn/Mul.py b/torch/legacy/nn/Mul.py index d1d1cfb0d8..4ba0567506 100644 --- a/torch/legacy/nn/Mul.py +++ b/torch/legacy/nn/Mul.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class Mul(Module): def __init__(self): @@ -12,9 +13,9 @@ class Mul(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1./math.sqrt(self.weight.size(0)) + stdv = 1. 
/ math.sqrt(self.weight.size(0)) self.weight.uniform_(-stdv, stdv) def updateOutput(self, input): @@ -28,5 +29,4 @@ class Mul(Module): return self.gradInput def accGradParameters(self, input, gradOutput, scale=1): - self.gradWeight[0] = self.gradWeight[0] + scale*input.dot(gradOutput); - + self.gradWeight[0] = self.gradWeight[0] + scale * input.dot(gradOutput) diff --git a/torch/legacy/nn/MulConstant.py b/torch/legacy/nn/MulConstant.py index 1865b29110..6652ffbaac 100644 --- a/torch/legacy/nn/MulConstant.py +++ b/torch/legacy/nn/MulConstant.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class MulConstant(Module): def __init__(self, constant_scalar, inplace=False): @@ -19,7 +20,6 @@ class MulConstant(Module): return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: return @@ -35,4 +35,3 @@ class MulConstant(Module): self.gradInput.mul_(self.constant_scalar) return self.gradInput - diff --git a/torch/legacy/nn/MultiCriterion.py b/torch/legacy/nn/MultiCriterion.py index 969894036a..455b32cf92 100644 --- a/torch/legacy/nn/MultiCriterion.py +++ b/torch/legacy/nn/MultiCriterion.py @@ -2,6 +2,7 @@ import torch from .Criterion import Criterion from .utils import recursiveResizeAs, recursiveFill, recursiveAdd + class MultiCriterion(Criterion): def __init__(self, ): @@ -14,7 +15,7 @@ class MultiCriterion(Criterion): new_weights = torch.DoubleStorage(len(self.criterions)) for i, v in enumerate(self.weights): new_weights[i] = v - new_weights[len(self.criterions)-1] = weight + new_weights[len(self.criterions) - 1] = weight self.weights = new_weights return self @@ -29,13 +30,12 @@ class MultiCriterion(Criterion): self.gradInput = recursiveResizeAs(self.gradInput, input)[0] recursiveFill(self.gradInput, 0) for i in range(len(self.criterions)): - recursiveAdd(self.gradInput, self.weights[i], self.criterions[i].updateGradInput(input, target)) + recursiveAdd(self.gradInput, self.weights[i], self.criterions[i].updateGradInput(input, target)) return self.gradInput def type(self, type): for criterion in self.criterions: - criterion.type(type) + criterion.type(type) return super(MultiCriterion, self).type(type) - diff --git a/torch/legacy/nn/MultiLabelMarginCriterion.py b/torch/legacy/nn/MultiLabelMarginCriterion.py index 4fbed6d1a5..42d6f7ac91 100644 --- a/torch/legacy/nn/MultiLabelMarginCriterion.py +++ b/torch/legacy/nn/MultiLabelMarginCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class MultiLabelMarginCriterion(Criterion): def __init__(self, sizeAverage=True): @@ -11,7 +12,7 @@ class MultiLabelMarginCriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) target = target.long() self._backend.MultiLabelMarginCriterion_updateOutput( self._backend.library_state, @@ -35,4 +36,3 @@ class MultiLabelMarginCriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/MultiLabelSoftMarginCriterion.py b/torch/legacy/nn/MultiLabelSoftMarginCriterion.py index 4cf509efe4..59b2b29b86 100644 --- a/torch/legacy/nn/MultiLabelSoftMarginCriterion.py +++ b/torch/legacy/nn/MultiLabelSoftMarginCriterion.py @@ -3,6 +3,7 @@ from .Criterion import Criterion from .Sigmoid import Sigmoid from .BCECriterion import BCECriterion + class MultiLabelSoftMarginCriterion(Criterion): """ A MultiLabel multiclass criterion based on sigmoid: @@ -18,24 +19,23 @@ class MultiLabelSoftMarginCriterion(Criterion): """ def 
__init__(self, weights=None): - super(MultiLabelSoftMarginCriterion, self).__init__() - self.lsm = Sigmoid() - self.nll = BCECriterion(weights) + super(MultiLabelSoftMarginCriterion, self).__init__() + self.lsm = Sigmoid() + self.nll = BCECriterion(weights) def updateOutput(self, input, target): - input = input if input.nelement() == 1 else input.squeeze() - target = target if target.nelement() == 1 else target.squeeze() - self.lsm.updateOutput(input) - self.nll.updateOutput(self.lsm.output, target) - self.output = self.nll.output - return self.output + input = input if input.nelement() == 1 else input.squeeze() + target = target if target.nelement() == 1 else target.squeeze() + self.lsm.updateOutput(input) + self.nll.updateOutput(self.lsm.output, target) + self.output = self.nll.output + return self.output def updateGradInput(self, input, target): - size = input.size() - input = input if input.nelement() == 1 else input.squeeze() - target = target if target.nelement() == 1 else target.squeeze() - self.nll.updateGradInput(self.lsm.output, target) - self.lsm.updateGradInput(input, self.nll.gradInput) - self.gradInput = self.lsm.gradInput.view(size) - return self.gradInput - + size = input.size() + input = input if input.nelement() == 1 else input.squeeze() + target = target if target.nelement() == 1 else target.squeeze() + self.nll.updateGradInput(self.lsm.output, target) + self.lsm.updateGradInput(input, self.nll.gradInput) + self.gradInput = self.lsm.gradInput.view(size) + return self.gradInput diff --git a/torch/legacy/nn/MultiMarginCriterion.py b/torch/legacy/nn/MultiMarginCriterion.py index a2e11d3ebb..f6b636fee0 100644 --- a/torch/legacy/nn/MultiMarginCriterion.py +++ b/torch/legacy/nn/MultiMarginCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class MultiMarginCriterion(Criterion): def __init__(self, p=1, weights=None, margin=1, sizeAverage=True): @@ -17,7 +18,7 @@ class MultiMarginCriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) target = target.long() self._backend.MultiMarginCriterion_updateOutput( self._backend.library_state, @@ -32,7 +33,6 @@ class MultiMarginCriterion(Criterion): self.output = self.output_tensor[0] return self.output - def updateGradInput(self, input, target): target = target.long() self._backend.MultiMarginCriterion_updateGradInput( @@ -46,4 +46,3 @@ class MultiMarginCriterion(Criterion): self.margin ) return self.gradInput - diff --git a/torch/legacy/nn/Narrow.py b/torch/legacy/nn/Narrow.py index 65997922d3..419be6cb2b 100644 --- a/torch/legacy/nn/Narrow.py +++ b/torch/legacy/nn/Narrow.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Narrow(Module): def __init__(self, dimension, offset, length=1): @@ -12,21 +13,19 @@ class Narrow(Module): def updateOutput(self, input): length = self.length if length < 0: - length = input.size(self.dimension) - self.index + self.length + 1 + length = input.size(self.dimension) - self.index + self.length + 1 output = input.narrow(self.dimension, self.index, length) self.output = self.output.type_as(output) self.output.resize_as_(output).copy_(output) return self.output - def updateGradInput(self, input, gradOutput): length = self.length if length < 0: - length = input.size(self.dimension) - self.index + self.length + 1 + length = input.size(self.dimension) - self.index + self.length + 1 self.gradInput = self.gradInput.type_as(input) 
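The MultiLabelSoftMarginCriterion hunk a little further up only re-indents the method bodies, but the composition it encodes is easy to miss in a whitespace diff: the loss is a Sigmoid feeding a BCECriterion. A minimal standard-library sketch of that composition follows; the helper name is mine and the plain mean assumes the default size-averaging, so treat it as illustrative rather than the legacy module's exact reduction semantics.

import math

def multilabel_soft_margin(scores, targets):
    """Mean of -[y*log(sigmoid(x)) + (1-y)*log(1-sigmoid(x))] over all elements."""
    total = 0.0
    for x, y in zip(scores, targets):
        p = 1.0 / (1.0 + math.exp(-x))                            # Sigmoid step (self.lsm)
        total += -(y * math.log(p) + (1 - y) * math.log(1 - p))   # BCE step (self.nll)
    return total / len(scores)

print(round(multilabel_soft_margin([2.0, -1.0, 0.5], [1, 0, 1]), 4))   # ~0.3048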
self.gradInput.resize_as_(input).zero_() self.gradInput.narrow(self.dimension, self.index, length).copy_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/NarrowTable.py b/torch/legacy/nn/NarrowTable.py index 5176259f83..48d8a03f55 100644 --- a/torch/legacy/nn/NarrowTable.py +++ b/torch/legacy/nn/NarrowTable.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear, recursiveResizeAs, recursiveFill + class NarrowTable(Module): def __init__(self, offset, length=1): @@ -11,7 +12,6 @@ class NarrowTable(Module): self.output = [] self.gradInput = [] - def updateOutput(self, input): self.output[:] = [input[self.offset + i] for i in range(self.length)] return self.output @@ -34,10 +34,8 @@ class NarrowTable(Module): return self.gradInput - def type(self, type=None, tensorCache=None): if not type: return self._type clear(self, 'output', 'gradInput') return super(NarrowTable, self).type(self, type, tensorCache) - diff --git a/torch/legacy/nn/Normalize.py b/torch/legacy/nn/Normalize.py index 81cd4f9d4d..a96fcfcff1 100644 --- a/torch/legacy/nn/Normalize.py +++ b/torch/legacy/nn/Normalize.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class Normalize(Module): def __init__(self, p, eps=1e-10): @@ -19,17 +20,16 @@ class Normalize(Module): self.cross = None self.buffer2 = None - def updateOutput(self, input): assert input.dim() == 2 input_size = input.size() if self._output is None: - self._output = input.new() + self._output = input.new() if self.norm is None: - self.norm = input.new() + self.norm = input.new() if self.buffer is None: - self.buffer = input.new() + self.buffer = input.new() self._output.resize_as_(input) @@ -44,14 +44,14 @@ class Normalize(Module): self.norm.add_(self.eps) else: if self.normp is None: - self.normp = input.new() + self.normp = input.new() if self.p % 2 != 0: torch.abs(input, out=self.buffer).pow_(self.p) else: torch.pow(input, self.p, out=self.buffer) torch.sum(self.buffer, 1, out=self.normp).add_(self.eps) - torch.pow(self.normp, 1./self.p, out=self.norm) + torch.pow(self.normp, 1. 
/ self.p, out=self.norm) torch.div(input, self.norm.view(-1, 1).expand_as(input), out=self._output) @@ -63,40 +63,40 @@ class Normalize(Module): assert gradOutput.dim() == 2 input_size = input.size() - n = input.size(0) # batch size - d = input.size(1) # dimensionality of vectors + n = input.size(0) # batch size + d = input.size(1) # dimensionality of vectors if self._gradInput is None: - self._gradInput = input.new() + self._gradInput = input.new() if self.cross is None: - self.cross = input.new() + self.cross = input.new() # compute diagonal term with gradOutput self._gradInput.resize_(n, d) if self.p == float('inf'): # specialization for the inf case - torch.mul(self.norm.view(n, 1,1).expand(n, d,1), gradOutput, out=self._gradInput) - self.buffer.resize_as_(input).zero_() - self.cross.resize_(n, 1) - torch.gather(input, 1, self._indices, out=self.cross) - self.cross.div_(self.norm) - self.buffer.scatter_(1, self._indices, self.cross) + torch.mul(self.norm.view(n, 1, 1).expand(n, d, 1), gradOutput, out=self._gradInput) + self.buffer.resize_as_(input).zero_() + self.cross.resize_(n, 1) + torch.gather(input, 1, self._indices, out=self.cross) + self.cross.div_(self.norm) + self.buffer.scatter_(1, self._indices, self.cross) else: - torch.mul(self.normp.view(n, 1).expand(n, d), gradOutput, out=self._gradInput) - # small optimizations for different p - # buffer = input*|input|^(p-2) - # for non-even p, need to add absolute value - if self.p % 2 != 0: - if self.p < 2: - # add eps to avoid possible division by 0 - torch.abs(input, out=self.buffer).add_(self.eps).pow_(self.p-2).mul_(input) - else: - torch.abs(input, out=self.buffer).pow_(self.p-2).mul_(input) - # special case for p == 2, pow(x, 0) = 1 - elif self.p == 2: - self.buffer.copy_(input) + torch.mul(self.normp.view(n, 1).expand(n, d), gradOutput, out=self._gradInput) + # small optimizations for different p + # buffer = input*|input|^(p-2) + # for non-even p, need to add absolute value + if self.p % 2 != 0: + if self.p < 2: + # add eps to avoid possible division by 0 + torch.abs(input, out=self.buffer).add_(self.eps).pow_(self.p - 2).mul_(input) else: - # p is even and > 2, pow(x, p) is always positive - torch.pow(input, self.p-2, out=self.buffer).mul_(input) + torch.abs(input, out=self.buffer).pow_(self.p - 2).mul_(input) + # special case for p == 2, pow(x, 0) = 1 + elif self.p == 2: + self.buffer.copy_(input) + else: + # p is even and > 2, pow(x, p) is always positive + torch.pow(input, self.p - 2, out=self.buffer).mul_(input) # compute cross term in two steps self.cross.resize_(n, 1) @@ -105,7 +105,7 @@ class Normalize(Module): #: the computations as b1*(b2*gradOutput). 
This avoids redundant # computation and also a huge buffer of size n*d^2 if self.buffer2 is None: - self.buffer2 = input.new() # nxd + self.buffer2 = input.new() # nxd torch.mul(input, gradOutput, out=self.buffer2) torch.sum(self.buffer2, 1, out=self.cross) @@ -143,13 +143,12 @@ class Normalize(Module): def clearState(self): clear(self, [ - '_output', - '_indices', - '_gradInput', - 'buffer', - 'norm', - 'normp', - 'cross', + '_output', + '_indices', + '_gradInput', + 'buffer', + 'norm', + 'normp', + 'cross', ]) return super(Normalize, self).clearState() - diff --git a/torch/legacy/nn/PReLU.py b/torch/legacy/nn/PReLU.py index 5f2e1946af..f11d1bcaec 100644 --- a/torch/legacy/nn/PReLU.py +++ b/torch/legacy/nn/PReLU.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class PReLU(Module): def __init__(self, nOutputPlane=0): @@ -36,9 +37,9 @@ class PReLU(Module): def accGradParameters(self, input, gradOutput, scale=1): if self.gradWeightBuf is None: - self.gradWeightBuf = input.new() + self.gradWeightBuf = input.new() if self.gradWeightBuf2 is None: - self.gradWeightBuf2 = input.new() + self.gradWeightBuf2 = input.new() self._backend.PReLU_accGradParameters( self._backend.library_state, input, @@ -56,4 +57,3 @@ class PReLU(Module): def clearState(self): clear(self, 'gradWeightBuf', 'gradWeightBuf2') return super(PReLU, self).clearState() - diff --git a/torch/legacy/nn/Padding.py b/torch/legacy/nn/Padding.py index aa13362e24..db5fd83467 100644 --- a/torch/legacy/nn/Padding.py +++ b/torch/legacy/nn/Padding.py @@ -35,7 +35,8 @@ class Padding(Module): self.output.narrow(dim, 0, input.size(dim)).copy_(input) else: self.output.narrow(dim, 0, index).copy_(input.narrow(dim, 0, index)) - self.output.narrow(dim, index + pad, input.size(dim) - index).copy_(input.narrow(dim, index, input.size(dim) - index)) + self.output.narrow(dim, index + pad, input.size(dim) - + index).copy_(input.narrow(dim, index, input.size(dim) - index)) return self.output @@ -56,6 +57,7 @@ class Padding(Module): self.gradInput.copy_(gradOutput.narrow(dim, 0, input.size(dim))) else: self.gradInput.narrow(dim, 0, index).copy_(gradOutput.narrow(dim, 0, index)) - self.gradInput.narrow(dim, index, input.size(dim) - index).copy_(gradOutput.narrow(dim, index + pad, input.size(dim) - index)) + self.gradInput.narrow(dim, index, input.size( + dim) - index).copy_(gradOutput.narrow(dim, index + pad, input.size(dim) - index)) return self.gradInput diff --git a/torch/legacy/nn/PairwiseDistance.py b/torch/legacy/nn/PairwiseDistance.py index d4a7571a6f..cf083daf1d 100644 --- a/torch/legacy/nn/PairwiseDistance.py +++ b/torch/legacy/nn/PairwiseDistance.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class PairwiseDistance(Module): def __init__(self, p): @@ -20,14 +21,14 @@ class PairwiseDistance(Module): assert input[0].dim() == 2 if self.diff is None: - self.diff = input[0].new() + self.diff = input[0].new() torch.add(input[0], -1, input[1], out=self.diff).abs_() self.output.resize_(input[0].size(0)) self.output.zero_() self.output.add_(self.diff.pow_(self.norm).sum(1)) - self.output.pow_(1./self.norm) + self.output.pow_(1. 
/ self.norm) return self.output @@ -38,10 +39,10 @@ class PairwiseDistance(Module): self.gradInput[:] = [None, None] if self.gradInput[0] is None: - self.gradInput[0] = input[0].new() + self.gradInput[0] = input[0].new() self.gradInput[0].resize_(input[0].size()) if self.gradInput[1] is None: - self.gradInput[1] = input[1].new() + self.gradInput[1] = input[1].new() self.gradInput[1].resize_(input[1].size()) self.gradInput[0].copy_(input[0]) self.gradInput[0].add_(-1, input[1]) @@ -52,21 +53,21 @@ class PairwiseDistance(Module): # Note: derivative of p-norm: # d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1) if self.norm > 2: - self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm-2)) + self.gradInput[0].mul_(self.gradInput[0].abs().pow_(self.norm - 2)) if self.outExpand is None: - self.outExpand = self.output.new() + self.outExpand = self.output.new() self.outExpand.resize_(self.output.size(0), 1) self.outExpand.copy_(self.output) self.outExpand.add_(1e-6) # Prevent divide by zero errors - self.outExpand.pow_(-(self.norm-1)) + self.outExpand.pow_(-(self.norm - 1)) self.gradInput[0].mul_(self.outExpand.expand(self.gradInput[0].size(0), - self.gradInput[0].size(1))) + self.gradInput[0].size(1))) if self.grad is None: - self.grad = gradOutput.new() + self.grad = gradOutput.new() if self.ones is None: - self.ones = gradOutput.new() + self.ones = gradOutput.new() self.grad.resize_as_(input[0]).zero_() self.ones.resize_(input[0].size(1)).fill_(1) @@ -80,4 +81,3 @@ class PairwiseDistance(Module): def clearState(self): clear(self, 'diff', 'outExpand', 'grad', 'ones') return super(PairwiseDistance, self).clearState() - diff --git a/torch/legacy/nn/Parallel.py b/torch/legacy/nn/Parallel.py index df94084c6e..6db1c060e3 100644 --- a/torch/legacy/nn/Parallel.py +++ b/torch/legacy/nn/Parallel.py @@ -94,8 +94,9 @@ class Parallel(Container): res = torch.typename(self) res += ' {' + line + tab + 'input' for i in range(len(self.modules)): - if i == len(self.modules)-1: - res += line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + extlast) + if i == len(self.modules) - 1: + res += line + tab + next + '(' + str(i) + '): ' + \ + str(self.modules[i]).replace(line, line + tab + extlast) else: res += line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext) diff --git a/torch/legacy/nn/ParallelCriterion.py b/torch/legacy/nn/ParallelCriterion.py index a44541caab..7ecfd95c6b 100644 --- a/torch/legacy/nn/ParallelCriterion.py +++ b/torch/legacy/nn/ParallelCriterion.py @@ -2,6 +2,7 @@ import torch from .Criterion import Criterion from .utils import recursiveResizeAs, recursiveFill, recursiveAdd + class ParallelCriterion(Criterion): def __init__(self, repeatTarget=False): @@ -36,4 +37,3 @@ class ParallelCriterion(Criterion): def type(self, type=None, tensorCache=None): self.gradInput = [] return super(ParallelCriterion, self).type(type, tensorCache) - diff --git a/torch/legacy/nn/ParallelTable.py b/torch/legacy/nn/ParallelTable.py index c3a78a16ba..41912a6a10 100644 --- a/torch/legacy/nn/ParallelTable.py +++ b/torch/legacy/nn/ParallelTable.py @@ -1,6 +1,7 @@ import torch from .Container import Container + class ParallelTable(Container): def __init__(self, ): @@ -9,7 +10,6 @@ class ParallelTable(Container): self.output = [] self.gradInput = [] - def updateOutput(self, input): for i in range(len(self.modules)): tmp = self.modules[i].updateOutput(input[i]) @@ -48,13 +48,13 @@ class ParallelTable(Container): res = 
torch.typename(self) res = res + ' {' + line + tab + 'input' for i in range(len(self.modules)): - if i == len(self.modules)-1: - res = res + line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + extlast) - else: - res = res + line + tab + next + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab + ext) - + if i == len(self.modules) - 1: + res = res + line + tab + next + '(' + str(i) + '): ' + \ + str(self.modules[i]).replace(line, line + tab + extlast) + else: + res = res + line + tab + next + '(' + str(i) + '): ' + \ + str(self.modules[i]).replace(line, line + tab + ext) res = res + line + tab + last + 'output' res = res + line + '}' return res - diff --git a/torch/legacy/nn/PartialLinear.py b/torch/legacy/nn/PartialLinear.py index 4f9ca847a3..d4e9030e02 100644 --- a/torch/legacy/nn/PartialLinear.py +++ b/torch/legacy/nn/PartialLinear.py @@ -6,6 +6,7 @@ from .Sequential import Sequential from .ParallelTable import ParallelTable from .MM import MM + class PartialLinear(Module): """ PartialLinear is a Linear layer that allows the user to a set a collection of @@ -27,15 +28,15 @@ class PartialLinear(Module): pt.add(Identity()).add(LookupTable(outputsize, inputsize)) self.network = Sequential().add(pt).add(MM(False, True)) if bias: - self.bias = torch.zeros(1, outputsize) + self.bias = torch.zeros(1, outputsize) self.gradBias = torch.zeros(1, outputsize) else: self.bias = self.gradBias = None # set partition: - self.inputsize = inputsize + self.inputsize = inputsize self.outputsize = outputsize - self.allcolumns = torch.range(0, self.outputsize-1).long() + self.allcolumns = torch.range(0, self.outputsize - 1).long() self.resetPartition() self.addBuffer = None self.buffer = None @@ -58,7 +59,7 @@ class PartialLinear(Module): if self.bias is not None: self.output.add_(torch.index_select(self.bias, 1, self.partition).expand_as(self.output)) if self.addBuffer is None: - self.addBuffer = input.new() + self.addBuffer = input.new() if self.addBuffer.nelement() != input.size(0): self.addBuffer.resize_(input.size(0)).fill_(1) @@ -66,8 +67,8 @@ class PartialLinear(Module): def updateGradInput(self, input, gradOutput): if self.gradInput is not None: - self.network.updateGradInput([input, self.partition], gradOutput) - self.gradInput.set_(self.network.gradInput[0]) + self.network.updateGradInput([input, self.partition], gradOutput) + self.gradInput.set_(self.network.gradInput[0]) return self.gradInput @@ -110,6 +111,5 @@ class PartialLinear(Module): def __repr__(self): return super(ParallelTable, self).__repr__() + \ - '({} -> {})'.format(self.inputsize, self.outputsize) + \ - ' without bias' if self.bias is None else '' - + '({} -> {})'.format(self.inputsize, self.outputsize) + \ + ' without bias' if self.bias is None else '' diff --git a/torch/legacy/nn/Power.py b/torch/legacy/nn/Power.py index f86f5e6235..20b23baefd 100644 --- a/torch/legacy/nn/Power.py +++ b/torch/legacy/nn/Power.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Power(Module): def __init__(self, p): @@ -17,4 +18,3 @@ class Power(Module): self.gradInput.pow_(self.pow - 1) self.gradInput.mul_(gradOutput).mul_(self.pow) return self.gradInput - diff --git a/torch/legacy/nn/RReLU.py b/torch/legacy/nn/RReLU.py index e1c9c83a52..237d927da7 100644 --- a/torch/legacy/nn/RReLU.py +++ b/torch/legacy/nn/RReLU.py @@ -2,9 +2,10 @@ import torch from .Module import Module from .utils import clear + class RReLU(Module): - def __init__(self, lower=1./8, upper=1./3, inplace=False): 
+ def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): super(RReLU, self).__init__() self.lower = lower self.upper = upper @@ -48,4 +49,3 @@ class RReLU(Module): def clearState(self): clear(self, 'noise') return super(RReLU, self).clearState() - diff --git a/torch/legacy/nn/ReLU.py b/torch/legacy/nn/ReLU.py index 617ade9ba8..2674f47cf9 100644 --- a/torch/legacy/nn/ReLU.py +++ b/torch/legacy/nn/ReLU.py @@ -1,8 +1,8 @@ import torch from .Threshold import Threshold + class ReLU(Threshold): def __init__(self, inplace=False): super(ReLU, self).__init__(0, 0, inplace) - diff --git a/torch/legacy/nn/ReLU6.py b/torch/legacy/nn/ReLU6.py index d833d6139f..cb8b59d2b5 100644 --- a/torch/legacy/nn/ReLU6.py +++ b/torch/legacy/nn/ReLU6.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class ReLU6(Module): def __init__(self, inplace=False): @@ -16,7 +17,6 @@ class ReLU6(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self._backend.HardTanh_updateGradInput( self._backend.library_state, @@ -26,4 +26,3 @@ class ReLU6(Module): 0, 6, self.inplace ) return self.gradInput - diff --git a/torch/legacy/nn/Replicate.py b/torch/legacy/nn/Replicate.py index 3923b06bc4..10f4d80884 100644 --- a/torch/legacy/nn/Replicate.py +++ b/torch/legacy/nn/Replicate.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Replicate(Module): def __init__(self, nf, dim=0): diff --git a/torch/legacy/nn/Reshape.py b/torch/legacy/nn/Reshape.py index 49c7029736..23d5ad9b8e 100644 --- a/torch/legacy/nn/Reshape.py +++ b/torch/legacy/nn/Reshape.py @@ -23,7 +23,7 @@ class Reshape(Module): def updateOutput(self, input): if not input.is_contiguous(): if self._input is None: - self._input = input.new() + self._input = input.new() self._input.resize_as_(input) self._input.copy_(input) input = self._input @@ -36,7 +36,7 @@ class Reshape(Module): def updateGradInput(self, input, gradOutput): if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput) self._gradOutput.copy_(gradOutput) gradOutput = self._gradOutput @@ -46,7 +46,7 @@ class Reshape(Module): def __repr__(self): return super(Reshape, self).__repr__() + \ - '({})'.format('x'.join(map(lambda x: str(x), self.size))) + '({})'.format('x'.join(map(lambda x: str(x), self.size))) def clearState(self): clear(self, '_input', '_gradOutput') diff --git a/torch/legacy/nn/Select.py b/torch/legacy/nn/Select.py index cb6d77fb51..287cb000e4 100644 --- a/torch/legacy/nn/Select.py +++ b/torch/legacy/nn/Select.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Select(Module): def __init__(self, dimension, index): @@ -20,4 +21,3 @@ class Select(Module): self.gradInput.zero_() self.gradInput.select(self.dimension, index).copy_(gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/SelectTable.py b/torch/legacy/nn/SelectTable.py index fe6e8b3585..7389a33bc3 100644 --- a/torch/legacy/nn/SelectTable.py +++ b/torch/legacy/nn/SelectTable.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import recursiveCopy + class SelectTable(Module): def __init__(self, index): @@ -48,7 +49,5 @@ class SelectTable(Module): del self.output[:] return super(SelectTable, self).type(type, tensorCache) - def __repr__(self): return super(SelectTable, self).__repr__() + '({})'.format(self.index) - diff --git a/torch/legacy/nn/Sequential.py b/torch/legacy/nn/Sequential.py index 04974e61c4..e3c4a0034a 100644 --- 
a/torch/legacy/nn/Sequential.py +++ b/torch/legacy/nn/Sequential.py @@ -1,6 +1,7 @@ import torch from .Container import Container + class Sequential(Container): def __len__(self): @@ -8,7 +9,7 @@ class Sequential(Container): def add(self, module): if len(self.modules) == 0: - self.gradInput = module.gradInput + self.gradInput = module.gradInput self.modules.append(module) self.output = module.output @@ -75,12 +76,11 @@ class Sequential(Container): res = 'nn.Sequential' res = res + ' {' + line + tab + '[input' for i in range(len(self.modules)): - res = res + next + '(' + str(i) + ')' + res = res + next + '(' + str(i) + ')' res = res + next + 'output]' for i in range(len(self.modules)): - res = res + line + tab + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab) + res = res + line + tab + '(' + str(i) + '): ' + str(self.modules[i]).replace(line, line + tab) res = res + line + '}' return res - diff --git a/torch/legacy/nn/Sigmoid.py b/torch/legacy/nn/Sigmoid.py index 40d849f61b..6e6343e6b0 100644 --- a/torch/legacy/nn/Sigmoid.py +++ b/torch/legacy/nn/Sigmoid.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Sigmoid(Module): def updateOutput(self, input): @@ -20,4 +21,3 @@ class Sigmoid(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/SmoothL1Criterion.py b/torch/legacy/nn/SmoothL1Criterion.py index 04748b0545..b16309e6d1 100644 --- a/torch/legacy/nn/SmoothL1Criterion.py +++ b/torch/legacy/nn/SmoothL1Criterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class SmoothL1Criterion(Criterion): def __init__(self, sizeAverage=True): @@ -10,7 +11,7 @@ class SmoothL1Criterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.SmoothL1Criterion_updateOutput( self._backend.library_state, input, @@ -30,4 +31,3 @@ class SmoothL1Criterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/SoftMarginCriterion.py b/torch/legacy/nn/SoftMarginCriterion.py index 1e8a89f731..612e4e0c03 100644 --- a/torch/legacy/nn/SoftMarginCriterion.py +++ b/torch/legacy/nn/SoftMarginCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class SoftMarginCriterion(Criterion): def __init__(self, ): @@ -10,7 +11,7 @@ class SoftMarginCriterion(Criterion): def updateOutput(self, input, target): if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.SoftMarginCriterion_updateOutput( self._backend.library_state, input, @@ -30,4 +31,3 @@ class SoftMarginCriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/SoftMax.py b/torch/legacy/nn/SoftMax.py index e924554624..24d5fa5967 100644 --- a/torch/legacy/nn/SoftMax.py +++ b/torch/legacy/nn/SoftMax.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SoftMax(Module): def updateOutput(self, input): @@ -20,4 +21,3 @@ class SoftMax(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/SoftMin.py b/torch/legacy/nn/SoftMin.py index a6e8737fe7..7c1bbbff3f 100644 --- a/torch/legacy/nn/SoftMin.py +++ b/torch/legacy/nn/SoftMin.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SoftMin(Module): def __init__(self): @@ -10,7 +11,7 @@ class SoftMin(Module): def updateOutput(self, input): if self.mininput is None: - self.mininput = input.new() + self.mininput = input.new() 
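The Sigmoid hunk above ends its backward call by passing the saved self.output into the backend, which works because the derivative of the sigmoid can be written purely in terms of its output: d(sigmoid)/dx = out * (1 - out). A short sanity-check sketch (helper name and the test point are mine, illustrative only):

import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

x = 0.7
out = sigmoid(x)
analytic = out * (1.0 - out)                                 # gradient from the output alone
eps = 1e-6
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)  # central-difference check
print(round(analytic, 6), round(numeric, 6))                 # both ~0.221713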
self.mininput.resize_as_(input).copy_(input).mul_(-1) self._backend.SoftMax_updateOutput( self._backend.library_state, @@ -21,7 +22,7 @@ class SoftMin(Module): def updateGradInput(self, input, gradOutput): if self.mininput is None: - self.mininput = input.new() + self.mininput = input.new() self.mininput.resize_as_(input).copy_(input).mul_(-1) self._backend.SoftMax_updateGradInput( self._backend.library_state, @@ -37,4 +38,3 @@ class SoftMin(Module): def clearState(self): clear(self, 'mininput') return super(SoftMin, self).clearState() - diff --git a/torch/legacy/nn/SoftPlus.py b/torch/legacy/nn/SoftPlus.py index b8f46d030a..854bc8d4fe 100644 --- a/torch/legacy/nn/SoftPlus.py +++ b/torch/legacy/nn/SoftPlus.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SoftPlus(Module): def __init__(self, beta=1): @@ -35,4 +36,3 @@ class SoftPlus(Module): self.threshold ) return self.gradInput - diff --git a/torch/legacy/nn/SoftShrink.py b/torch/legacy/nn/SoftShrink.py index b663c54f47..a3ac316650 100644 --- a/torch/legacy/nn/SoftShrink.py +++ b/torch/legacy/nn/SoftShrink.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SoftShrink(Module): def __init__(self, lambd=0.5): @@ -25,4 +26,3 @@ class SoftShrink(Module): self.lambd ) return self.gradInput - diff --git a/torch/legacy/nn/SoftSign.py b/torch/legacy/nn/SoftSign.py index c5e1bcb2aa..9aa58c1f7b 100644 --- a/torch/legacy/nn/SoftSign.py +++ b/torch/legacy/nn/SoftSign.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SoftSign(Module): def __init__(self): @@ -11,14 +12,14 @@ class SoftSign(Module): def updateOutput(self, input): if self.temp is None: - self.temp = input.new() + self.temp = input.new() self.temp.resize_as_(input).copy_(input).abs_().add_(1) self.output.resize_as_(input).copy_(input).div_(self.temp) return self.output def updateGradInput(self, input, gradOutput): if self.tempgrad is None: - self.tempgrad = input.new() + self.tempgrad = input.new() self.tempgrad.resize_as_(self.output).copy_(input).abs_().add_(1).mul_(self.tempgrad) self.gradInput.resize_as_(input).copy_(gradOutput).div_(self.tempgrad) return self.gradInput @@ -26,4 +27,3 @@ class SoftSign(Module): def clearState(self): clear(self, 'temp', 'tempgrad') return super(SoftSign, self).clearState() - diff --git a/torch/legacy/nn/SpatialAdaptiveMaxPooling.py b/torch/legacy/nn/SpatialAdaptiveMaxPooling.py index f97a849e54..b8ed87492c 100644 --- a/torch/legacy/nn/SpatialAdaptiveMaxPooling.py +++ b/torch/legacy/nn/SpatialAdaptiveMaxPooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SpatialAdaptiveMaxPooling(Module): def __init__(self, w, h): @@ -12,7 +13,7 @@ class SpatialAdaptiveMaxPooling(Module): def updateOutput(self, input): if self.indices is None: - self.indices = input.new() + self.indices = input.new() self.indices = self.indices.long() self._backend.SpatialAdaptiveMaxPooling_updateOutput( self._backend.library_state, @@ -37,4 +38,3 @@ class SpatialAdaptiveMaxPooling(Module): def clearState(self): clear(self, 'indices') return super(SpatialAdaptiveMaxPooling, self).clearState() - diff --git a/torch/legacy/nn/SpatialAveragePooling.py b/torch/legacy/nn/SpatialAveragePooling.py index 7be1b6b8e0..acf4c64083 100644 --- a/torch/legacy/nn/SpatialAveragePooling.py +++ b/torch/legacy/nn/SpatialAveragePooling.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialAveragePooling(Module): def __init__(self, kW, kH, dW=1, dH=1, padW=0, padH=0): 
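The SoftMin hunks above show the whole trick behind the module: copy the input into self.mininput, negate it, and let the SoftMax backend do the rest, since softmin(x) is just softmax(-x). A small pure-Python sketch of that identity (function names are mine, illustrative only):

import math

def softmax(xs):
    m = max(xs)                                   # subtract the max for numerical stability
    exps = [math.exp(x - m) for x in xs]
    s = sum(exps)
    return [e / s for e in exps]

def softmin(xs):
    return softmax([-x for x in xs])              # the mul_(-1) step, then SoftMax

print([round(v, 4) for v in softmin([1.0, 2.0, 3.0])])    # [0.6652, 0.2447, 0.09]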
@@ -46,7 +47,7 @@ class SpatialAveragePooling(Module): # for backward compatibility with saved models # which are not supposed to have "divide" field if not self.divide: - self.output.mul_(self.kW*self.kH) + self.output.mul_(self.kW * self.kH) return self.output @@ -65,7 +66,7 @@ class SpatialAveragePooling(Module): ) # for backward compatibility if not self.divide: - self.gradInput.mul_(self.kW*self.kH) + self.gradInput.mul_(self.kW * self.kH) return self.gradInput @@ -76,4 +77,3 @@ class SpatialAveragePooling(Module): s += ', {}, {}'.format(self.padW, self.padH) s += ')' return s - diff --git a/torch/legacy/nn/SpatialBatchNormalization.py b/torch/legacy/nn/SpatialBatchNormalization.py index 3fc70ed0dd..725ebfffc6 100644 --- a/torch/legacy/nn/SpatialBatchNormalization.py +++ b/torch/legacy/nn/SpatialBatchNormalization.py @@ -1,6 +1,7 @@ import torch from .BatchNormalization import BatchNormalization + class SpatialBatchNormalization(BatchNormalization): """ This class implements Batch Normalization as described in the paper: diff --git a/torch/legacy/nn/SpatialClassNLLCriterion.py b/torch/legacy/nn/SpatialClassNLLCriterion.py index af1223ad53..95b9b5d084 100644 --- a/torch/legacy/nn/SpatialClassNLLCriterion.py +++ b/torch/legacy/nn/SpatialClassNLLCriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class SpatialClassNLLCriterion(Criterion): def __init__(self, weights=None, sizeAverage=True): diff --git a/torch/legacy/nn/SpatialContrastiveNormalization.py b/torch/legacy/nn/SpatialContrastiveNormalization.py index 35e9e0bafc..d7e9251707 100644 --- a/torch/legacy/nn/SpatialContrastiveNormalization.py +++ b/torch/legacy/nn/SpatialContrastiveNormalization.py @@ -4,6 +4,7 @@ from .Sequential import Sequential from .SpatialSubtractiveNormalization import SpatialSubtractiveNormalization from .SpatialDivisiveNormalization import SpatialDivisiveNormalization + class SpatialContrastiveNormalization(Module): def __init__(self, nInputPlane=1, kernel=None, threshold=1e-4, thresval=1e-4): @@ -12,23 +13,23 @@ class SpatialContrastiveNormalization(Module): # get args self.nInputPlane = nInputPlane if kernel is None: - self.kernel = torch.Tensor(9, 9).fill_(1) + self.kernel = torch.Tensor(9, 9).fill_(1) self.threshold = threshold self.thresval = thresval or threshold kdim = self.kernel.ndimension() # check args if kdim != 2 and kdim != 1: - raise ValueError('SpatialContrastiveNormalization averaging kernel must be 2D or 1D') + raise ValueError('SpatialContrastiveNormalization averaging kernel must be 2D or 1D') if self.kernel.size(0) % 2 == 0 or (kdim == 2 and (self.kernel.size(1) % 2) == 0): - raise ValueError('SpatialContrastiveNormalization averaging kernel must have ODD dimensions') + raise ValueError('SpatialContrastiveNormalization averaging kernel must have ODD dimensions') # instantiate sub+div normalization self.normalizer = Sequential() self.normalizer.add(SpatialSubtractiveNormalization(self.nInputPlane, self.kernel)) self.normalizer.add(SpatialDivisiveNormalization(self.nInputPlane, self.kernel, - self.threshold, self.thresval)) + self.threshold, self.thresval)) def updateOutput(self, input): self.output = self.normalizer.forward(input) @@ -37,4 +38,3 @@ class SpatialContrastiveNormalization(Module): def updateGradInput(self, input, gradOutput): self.gradInput = self.normalizer.backward(input, gradOutput) return self.gradInput - diff --git a/torch/legacy/nn/SpatialConvolution.py b/torch/legacy/nn/SpatialConvolution.py index aa9b4c1b6a..d5d8163128 100644 --- 
a/torch/legacy/nn/SpatialConvolution.py +++ b/torch/legacy/nn/SpatialConvolution.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class SpatialConvolution(Module): def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None): @@ -36,9 +37,9 @@ class SpatialConvolution(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kW*self.kH*self.nInputPlane) + stdv = 1. / math.sqrt(self.kW * self.kH * self.nInputPlane) self.weight.uniform_(-stdv, stdv) if self.bias is not None: @@ -46,15 +47,15 @@ class SpatialConvolution(Module): def _makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input + if self._input is None: + self._input = input.new() + self._input.resize_as_(input).copy_(input) + input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput return input, gradOutput @@ -97,7 +98,6 @@ class SpatialConvolution(Module): self._unviewWeight() return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: return @@ -157,10 +157,9 @@ class SpatialConvolution(Module): s += ')' if self.bias is None: - s += ' without bias' + s += ' without bias' return s def clearState(self): clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') return super(SpatialConvolution, self).clearState() - diff --git a/torch/legacy/nn/SpatialConvolutionLocal.py b/torch/legacy/nn/SpatialConvolutionLocal.py index cdc5d4a395..c87ff5f671 100644 --- a/torch/legacy/nn/SpatialConvolutionLocal.py +++ b/torch/legacy/nn/SpatialConvolutionLocal.py @@ -3,9 +3,10 @@ import torch from .Module import Module from .utils import clear + class SpatialConvolutionLocal(Module): - def __init__(self, nInputPlane, nOutputPlane, iW, iH ,kW, kH, dW=1, dH=1, padW=0, padH=None): + def __init__(self, nInputPlane, nOutputPlane, iW, iH, kW, kH, dW=1, dH=1, padW=0, padH=None): super(SpatialConvolutionLocal, self).__init__() self.nInputPlane = nInputPlane @@ -34,19 +35,19 @@ class SpatialConvolutionLocal(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kW*self.kH*self.nInputPlane) + stdv = 1. 
/ math.sqrt(self.kW * self.kH * self.nInputPlane) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) def _makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input + if self._input is None: + self._input = input.new() + self._input.resize_as_(input).copy_(input) + input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): @@ -61,22 +62,24 @@ class SpatialConvolutionLocal(Module): def _viewWeight(self): self.weight = self.weight.view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW) if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW) + self.gradWeight = self.gradWeight.view( + self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW) def _unviewWeight(self): self.weight = self.weight.view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW) if self.gradWeight is not None and self.gradWeight.dim() > 0: - self.gradWeight = self.gradWeight.view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW) + self.gradWeight = self.gradWeight.view( + self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW) def _checkInputSize(self, input): if input.ndimension() == 3: if input.size(0) != self.nInputPlane or input.size(1) != self.iH or input.size(1) != self.iW: raise RuntimeError('Given input size: ({}x{}x{}) inconsistent with expected input size: ({}x{}x{}).'.format( - input.size(0), input.size(1), input.size(2), self.nInputPlane, self.iH, self.iW)) + input.size(0), input.size(1), input.size(2), self.nInputPlane, self.iH, self.iW)) elif input.ndimension() == 4: if input.size(1) != self.nInputPlane or input.size(2) != self.iH or input.size(3) != self.iW: raise RuntimeError('Given input size: ({}x{}x{}x{}) inconsistent with expected input size: (*x{}x{}x{}).'.format( - input.size(0), input.size(1), input.size(2), input.size(3), self.nInputPlane, self.iH, self.iW)) + input.size(0), input.size(1), input.size(2), input.size(3), self.nInputPlane, self.iH, self.iW)) else: raise RuntimeError('3D or 4D (batch mode) tensor expected') @@ -87,19 +90,19 @@ class SpatialConvolutionLocal(Module): if output.ndimension() == 3: if output.size(0) != self.nOutputPlane or output.size(1) != self.oH or output.size(2) != self.oW: raise RuntimeError('Given output size: ({}x{}x{}) inconsistent with expected output size: ({}x{}x{}).'.format( - output.size(0), output.size(1), output.size(2), self.nOutputPlane, self.oH, self.oW)) + output.size(0), output.size(1), output.size(2), self.nOutputPlane, self.oH, self.oW)) elif output.ndimension() == 4: if output.size(1) != self.nOutputPlane or output.size(2) != self.oH or output.size(3) != self.oW: raise RuntimeError('Given output size: ({}x{}x{}x{}) inconsistent with expected output size: (batchsize x{}x{}x{}).'.format( - output.size(0), output.size(1), output.size(2), output.size(3), self.nOutputPlane, self.oH, self.oW)) + output.size(0), output.size(1), output.size(2), output.size(3), self.nOutputPlane, self.oH, self.oW)) else: raise RuntimeError('3D or 4D(batch mode) tensor expected') def updateOutput(self, input): if self.finput is None: - self.finput = input.new() + self.finput = input.new() if self.fgradInput is None: - self.fgradInput = input.new() + self.fgradInput = input.new() 
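Both reset() hunks above (SpatialConvolution and SpatialConvolutionLocal) normalise the spacing of the same fan-in initialisation: the uniform bound is 1 / sqrt(kW * kH * nInputPlane), and a standard deviation passed explicitly is scaled by sqrt(3), because U(-b, b) has standard deviation b / sqrt(3). A small sketch of that bound (helper name and the example sizes are mine, illustrative only):

import math
import random

def init_bound(kW, kH, nInputPlane, stdv=None):
    if stdv is not None:
        return stdv * math.sqrt(3)                 # so U(-b, b) ends up with std = stdv
    return 1.0 / math.sqrt(kW * kH * nInputPlane)  # default fan-in bound

b = init_bound(kW=3, kH=3, nInputPlane=64)
sample = [random.uniform(-b, b) for _ in range(4)]     # what weight.uniform_(-stdv, stdv) draws
print(round(b, 5), [round(w, 5) for w in sample])      # b == 1/24 ~ 0.04167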
self._checkInputSize(input) self._viewWeight() input = self._makeContiguous(input) @@ -190,4 +193,3 @@ class SpatialConvolutionLocal(Module): def clearState(self): clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') return super(SpatialConvolutionLocal, self).clearState() - diff --git a/torch/legacy/nn/SpatialConvolutionMap.py b/torch/legacy/nn/SpatialConvolutionMap.py index 7f4f7e6b22..e901140a52 100644 --- a/torch/legacy/nn/SpatialConvolutionMap.py +++ b/torch/legacy/nn/SpatialConvolutionMap.py @@ -5,13 +5,14 @@ from .Module import Module # TODO fix THNN... + class SpatialConvolutionMap(Module): class maps(object): @staticmethod def full(nin, nout): - ft = torch.Tensor(nin*nout, 2) + ft = torch.Tensor(nin * nout, 2) p = 0 for j in range(nout): for i in range(nin): @@ -34,19 +35,19 @@ class SpatialConvolutionMap(Module): tbl = torch.Tensor(nker, 2) fi = torch.randperm(nin) frcntr = 0 - nfi = math.floor(nin / nto) # number of distinct nto chunks + nfi = math.floor(nin / nto) # number of distinct nto chunks totbl = tbl.select(1, 1) frtbl = tbl.select(1, 0) - fitbl = fi.narrow(0, 0, (nfi * nto)) # part of fi that covers distinct chunks + fitbl = fi.narrow(0, 0, (nfi * nto)) # part of fi that covers distinct chunks ufrtbl = frtbl.unfold(0, nto, nto) utotbl = totbl.unfold(0, nto, nto) ufitbl = fitbl.unfold(0, nto, nto) # start fill_ing frtbl - for i in range(nout): # fro each unit in target map + for i in range(nout): # fro each unit in target map ufrtbl.select(0, i).copy_(ufitbl.select(0, frcntr)) frcntr += 1 - if frcntr-1 == nfi: # reset fi + if frcntr - 1 == nfi: # reset fi fi.copy_(torch.randperm(nin)) frcntr = 1 @@ -80,11 +81,11 @@ class SpatialConvolutionMap(Module): else: ninp = torch.Tensor(self.nOutputPlane).zero_() for i in range(self.connTable.size(0)): - idx = int(self.connTable[i,1]) + idx = int(self.connTable[i, 1]) ninp[idx] += 1 for k in range(self.connTable.size(0)): - idx = int(self.connTable[k,1]) - stdv = 1. / math.sqrt(self.kW*self.kH*ninp[idx]) + idx = int(self.connTable[k, 1]) + stdv = 1. / math.sqrt(self.kW * self.kH * ninp[idx]) self.weight.select(0, k).uniform_(-stdv, stdv) for k in range(self.bias.size(0)): stdv = 1. 
/ math.sqrt(self.kW * self.kH * ninp[k]) @@ -133,4 +134,3 @@ class SpatialConvolutionMap(Module): self.dW, self.dH, scale ) - diff --git a/torch/legacy/nn/SpatialCrossMapLRN.py b/torch/legacy/nn/SpatialCrossMapLRN.py index 7fa34c92d2..4b7402a46d 100644 --- a/torch/legacy/nn/SpatialCrossMapLRN.py +++ b/torch/legacy/nn/SpatialCrossMapLRN.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SpatialCrossMapLRN(Module): def __init__(self, size, alpha=1e-4, beta=0.75, k=1): @@ -19,7 +20,7 @@ class SpatialCrossMapLRN(Module): assert input.dim() == 4 if self.scale is None: - self.scale = input.new() + self.scale = input.new() if input.type() == 'torch.cuda.FloatTensor': self._backend.SpatialCrossMapLRN_updateOutput( self._backend.library_state, @@ -32,10 +33,10 @@ class SpatialCrossMapLRN(Module): self.k ) else: - batchSize = input.size(0) - channels = input.size(1) + batchSize = input.size(0) + channels = input.size(1) inputHeight = input.size(2) - inputWidth = input.size(3) + inputWidth = input.size(3) self.output.resize_as_(input) self.scale.resize_as_(input) @@ -44,7 +45,7 @@ class SpatialCrossMapLRN(Module): inputSquare = self.output torch.pow(input, 2, out=inputSquare) - prePad = int((self.size - 1)/2 + 1) + prePad = int((self.size - 1) / 2 + 1) prePadCrop = channels if prePad > channels else prePad scaleFirst = self.scale.select(1, 0) @@ -57,10 +58,10 @@ class SpatialCrossMapLRN(Module): # by adding the next feature map and removing the previous for c in range(1, channels): scalePrevious = self.scale.select(1, c - 1) - scaleCurrent = self.scale.select(1, c) + scaleCurrent = self.scale.select(1, c) scaleCurrent.copy_(scalePrevious) if c < channels - prePad + 1: - squareNext = inputSquare.select(1, c + prePad - 1) + squareNext = inputSquare.select(1, c + prePad - 1) scaleCurrent.add_(1, squareNext) if c > prePad: @@ -91,15 +92,15 @@ class SpatialCrossMapLRN(Module): self.k ) else: - batchSize = input.size(0) - channels = input.size(1) + batchSize = input.size(0) + channels = input.size(1) inputHeight = input.size(2) - inputWidth = input.size(3) + inputWidth = input.size(3) if self.paddedRatio is None: - self.paddedRatio = input.new() + self.paddedRatio = input.new() if self.accumRatio is None: - self.accumRatio = input.new() + self.accumRatio = input.new() self.paddedRatio.resize_(channels + self.size - 1, inputHeight, inputWidth) self.accumRatio.resize_(inputHeight, inputWidth) @@ -114,9 +115,9 @@ class SpatialCrossMapLRN(Module): for n in range(batchSize): torch.mul(gradOutput[n], self.output[n], out=paddedRatioCenter) paddedRatioCenter.div_(self.scale[n]) - torch.sum(self.paddedRatio.narrow(0, 0,self.size-1), 0, out=self.accumRatio) + torch.sum(self.paddedRatio.narrow(0, 0, self.size - 1), 0, out=self.accumRatio) for c in range(channels): - self.accumRatio.add_(self.paddedRatio[c+self.size-1]) + self.accumRatio.add_(self.paddedRatio[c + self.size - 1]) self.gradInput[n][c].addcmul_(-cacheRatioValue, input[n][c], self.accumRatio) self.accumRatio.add_(-1, self.paddedRatio[c]) @@ -125,4 +126,3 @@ class SpatialCrossMapLRN(Module): def clearState(self): clear(self, 'scale', 'paddedRatio', 'accumRatio') return super(SpatialCrossMapLRN, self).clearState() - diff --git a/torch/legacy/nn/SpatialDilatedConvolution.py b/torch/legacy/nn/SpatialDilatedConvolution.py index 0953638af3..73056c8966 100644 --- a/torch/legacy/nn/SpatialDilatedConvolution.py +++ b/torch/legacy/nn/SpatialDilatedConvolution.py @@ -1,6 +1,7 @@ import torch from .SpatialConvolution import 
SpatialConvolution + class SpatialDilatedConvolution(SpatialConvolution): def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None, dilationH=1, dilationW=None): @@ -11,9 +12,9 @@ class SpatialDilatedConvolution(SpatialConvolution): def updateOutput(self, input): if self.finput is None: - self.finput = self.weight.new() + self.finput = self.weight.new() if self.fgradInput is None: - self.fgradInput = self.weight.new() + self.fgradInput = self.weight.new() input = self._makeContiguous(input) self._backend.SpatialDilatedConvolution_updateOutput( self._backend.library_state, @@ -36,7 +37,7 @@ class SpatialDilatedConvolution(SpatialConvolution): input, gradOutput = self._makeContiguous(input, gradOutput) if self.fgradInput is None: - self.fgradInput = self.weight.new() + self.fgradInput = self.weight.new() self._backend.SpatialDilatedConvolution_updateGradInput( self._backend.library_state, input, @@ -54,7 +55,7 @@ class SpatialDilatedConvolution(SpatialConvolution): def accGradParameters(self, input, gradOutput, scale=1): input, gradOutput = self._makeContiguous(input, gradOutput) if self.fgradInput is None: - self.fgradInput = self.weight.new() + self.fgradInput = self.weight.new() self._backend.SpatialDilatedConvolution_accGradParameters( self._backend.library_state, input, @@ -83,6 +84,5 @@ class SpatialDilatedConvolution(SpatialConvolution): s += ')' if self.bias is None: - s += ' without bias' + s += ' without bias' return s - diff --git a/torch/legacy/nn/SpatialDivisiveNormalization.py b/torch/legacy/nn/SpatialDivisiveNormalization.py index 262a4aacb6..7d1b7246df 100644 --- a/torch/legacy/nn/SpatialDivisiveNormalization.py +++ b/torch/legacy/nn/SpatialDivisiveNormalization.py @@ -35,10 +35,10 @@ class SpatialDivisiveNormalization(Module): raise ValueError('SpatialDivisiveNormalization averaging kernel must have ODD dimensions') # padding values - padH = int(math.floor(self.kernel.size(0)/2)) + padH = int(math.floor(self.kernel.size(0) / 2)) padW = padH if kdim == 2: - padW = int(math.floor(self.kernel.size(1)/2)) + padW = int(math.floor(self.kernel.size(1) / 2)) # create convolutional mean estimator self.meanestimator = Sequential() @@ -46,7 +46,8 @@ class SpatialDivisiveNormalization(Module): if kdim == 2: self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) else: - self.meanestimator.add(SpatialConvolutionMap(SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) + self.meanestimator.add(SpatialConvolutionMap( + SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) self.meanestimator.add(Replicate(self.nInputPlane, 1)) @@ -58,7 +59,8 @@ class SpatialDivisiveNormalization(Module): if kdim == 2: self.stdestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) else: - self.stdestimator.add(SpatialConvolutionMap(SpatialContolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) + self.stdestimator.add(SpatialConvolutionMap( + SpatialContolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) self.stdestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) self.stdestimator.add(Replicate(self.nInputPlane, 1)) @@ -102,14 +104,14 @@ class SpatialDivisiveNormalization(Module): # compute side coefficients dim = input.dim() - if self.localstds.dim() != self.coef.dim() or (input.size(dim-1) != 
self.coef.size(dim-1)) or (input.size(dim-2) != self.coef.size(dim-2)): + if self.localstds.dim() != self.coef.dim() or (input.size(dim - 1) != self.coef.size(dim - 1)) or (input.size(dim - 2) != self.coef.size(dim - 2)): if self.ones is None: - self.ones = input.new() + self.ones = input.new() self.ones.resize_as_(input[0:1]).fill_(1) coef = self.meanestimator.updateOutput(self.ones).squeeze(0) if self._coef is None: - self._coef = input.new() - self._coef.resize_as_(coef).copy_(coef) # make contiguous for view + self._coef = input.new() + self._coef.resize_as_(coef).copy_(coef) # make contiguous for view self.coef = self._coef.view(1, *self._coef.size()).expand_as(self.localstds) # normalize std dev diff --git a/torch/legacy/nn/SpatialDropout.py b/torch/legacy/nn/SpatialDropout.py index 7cfb987adb..fc05fcf27f 100644 --- a/torch/legacy/nn/SpatialDropout.py +++ b/torch/legacy/nn/SpatialDropout.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SpatialDropout(Module): def __init__(self, p=0.5): @@ -18,20 +19,20 @@ class SpatialDropout(Module): else: raise RuntimeError('Input must be 4D (nbatch, nfeat, h, w)') - self.noise.bernoulli_(1-self.p) + self.noise.bernoulli_(1 - self.p) # We expand the random dropouts to the entire feature map because the # features are likely correlated accross the map and so the dropout # should also be correlated. self.output.mul_(self.noise.expand_as(input)) else: - self.output.mul_(1-self.p) + self.output.mul_(1 - self.p) return self.output def updateGradInput(self, input, gradOutput): if self.train: self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector + self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector else: raise RuntimeError('backprop only defined while training') @@ -46,4 +47,3 @@ class SpatialDropout(Module): def clearState(self): clear(self, 'noise') return super(SpatialDropout, self).clearState() - diff --git a/torch/legacy/nn/SpatialFractionalMaxPooling.py b/torch/legacy/nn/SpatialFractionalMaxPooling.py index 9888b842de..9bc55b7bb2 100644 --- a/torch/legacy/nn/SpatialFractionalMaxPooling.py +++ b/torch/legacy/nn/SpatialFractionalMaxPooling.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class SpatialFractionalMaxPooling(Module): # Usage: # nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) @@ -34,21 +35,21 @@ class SpatialFractionalMaxPooling(Module): self.indices = None if arg1 >= 1 and arg2 >= 1: - # Desired output size: the input tensor will determine the reduction - # ratio - self.outW = arg1 - self.outH = arg2 - self.ratioW = self.ratioH = None + # Desired output size: the input tensor will determine the reduction + # ratio + self.outW = arg1 + self.outH = arg2 + self.ratioW = self.ratioH = None else: - # Reduction ratio specified per each input - # This is the reduction ratio that we use - self.ratioW = arg1 - self.ratioH = arg2 - self.outW = self.outH = None + # Reduction ratio specified per each input + # This is the reduction ratio that we use + self.ratioW = arg1 + self.ratioH = arg2 + self.outW = self.outH = None - # The reduction ratio must be between 0 and 1 - assert self.ratioW > 0 and self.ratioW < 1 - assert self.ratioH > 0 and self.ratioH < 1 + # The reduction ratio must be between 0 and 1 + assert self.ratioW > 0 and self.ratioW < 1 + assert self.ratioH > 0 and self.ratioH < 1 def _getBufferSize(self, 
input): assert input.ndimension() == 4 @@ -57,7 +58,6 @@ class SpatialFractionalMaxPooling(Module): return torch.Size([batchSize, planeSize, 2]) - def _initSampleBuffer(self, input): sampleBufferSize = self._getBufferSize(input) @@ -93,7 +93,7 @@ class SpatialFractionalMaxPooling(Module): def updateOutput(self, input): if self.indices is None: - self.indices = input.new() + self.indices = input.new() self.indices = self.indices.long() self._initSampleBuffer(input) outW, outH = self._getOutputSizes(input) @@ -130,6 +130,6 @@ class SpatialFractionalMaxPooling(Module): def __repr__(self): return super(SpatialFractionalMaxPooling, self).__repr__() + \ - '({}x{}, {}, {})'.format(self.outW or self.ratioW, - self.outH or self.ratioH, - self.poolSizeW, self.poolSizeH) + '({}x{}, {}, {})'.format(self.outW or self.ratioW, + self.outH or self.ratioH, + self.poolSizeW, self.poolSizeH) diff --git a/torch/legacy/nn/SpatialFullConvolution.py b/torch/legacy/nn/SpatialFullConvolution.py index 212ea4401c..a230dba646 100644 --- a/torch/legacy/nn/SpatialFullConvolution.py +++ b/torch/legacy/nn/SpatialFullConvolution.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class SpatialFullConvolution(Module): def __init__(self, nInputPlane, nOutputPlane, kW, kH, dW=1, dH=1, padW=0, padH=None, adjW=0, adjH=0): @@ -46,7 +47,7 @@ class SpatialFullConvolution(Module): nInputPlane = self.nInputPlane kH = self.kH kW = self.kW - stdv = 1/math.sqrt(kW*kH*nInputPlane) + stdv = 1 / math.sqrt(kW * kH * nInputPlane) self.weight.uniform_(-stdv, stdv) if self.bias is not None: @@ -54,15 +55,15 @@ class SpatialFullConvolution(Module): def _makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input + if self._input is None: + self._input = input.new() + self._input.resize_as_(input).copy_(input) + input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput return input, gradOutput @@ -82,20 +83,19 @@ class SpatialFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) if self.finput is None: - self.finput = input[0].new() + self.finput = input[0].new() if self.fgradInput is None: - self.fgradInput = input[0].new() + self.fgradInput = input[0].new() else: if self.finput is None: - self.finput = input.new() + self.finput = input.new() if self.fgradInput is None: - self.fgradInput = input.new() - + self.fgradInput = input.new() inputTensor = self._makeContiguous(inputTensor) self._backend.SpatialFullConvolution_updateOutput( @@ -125,8 +125,8 @@ class SpatialFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) # Momentarily extract the gradInput tensor @@ 
-150,9 +150,9 @@ class SpatialFullConvolution(Module): if isinstance(input, list): # Create a zero tensor to be expanded and used as gradInput[1]. if self.zeroScalar is None: - self.zeroScalar = input[1].new(1).zero_() + self.zeroScalar = input[1].new(1).zero_() self.ones.resize_(input[1].dim()).fill_(1) - zeroTensor = self.zeroScalar.view_as(self.ones).expand_as(input[1]) + zeroTensor = self.zeroScalar.view_as(self.ones).expand_as(input[1]) self.gradInput = [self.gradInput, zeroTensor] return self.gradInput @@ -167,8 +167,8 @@ class SpatialFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjW = calculateAdj(tW, self.kW, self.padW, self.dW) adjH = calculateAdj(tH, self.kH, self.padH, self.dH) @@ -215,5 +215,3 @@ class SpatialFullConvolution(Module): def clearState(self): clear(self, 'finput', 'fgradInput', '_input', '_gradOutput') return super(SpatialFullConvolution, self).clearState() - - diff --git a/torch/legacy/nn/SpatialFullConvolutionMap.py b/torch/legacy/nn/SpatialFullConvolutionMap.py index 9e2309977e..b4981f3fd5 100644 --- a/torch/legacy/nn/SpatialFullConvolutionMap.py +++ b/torch/legacy/nn/SpatialFullConvolutionMap.py @@ -3,6 +3,7 @@ import math import torch from .Module import Module + class SpatialFullConvolutionMap(Module): def __init__(self, conMatrix, kW, kH, dW=1, dH=1): @@ -36,10 +37,10 @@ class SpatialFullConvolutionMap(Module): ninp[idx] += 1 for k in range(self.connTable.size(0)): idx = int(self.connTable[k][1]) - stdv = 1. / math.sqrt(self.kW*self.kH*ninp[idx]) + stdv = 1. / math.sqrt(self.kW * self.kH * ninp[idx]) self.weight[k].uniform_(-stdv, stdv) for k in range(self.bias.size(0)): - stdv = 1. / math.sqrt(self.kW*self.kH*ninp[k]) + stdv = 1. / math.sqrt(self.kW * self.kH * ninp[k]) # TODO: torch.uniform self.bias[k] = random.uniform(-stdv, stdv) @@ -57,7 +58,6 @@ class SpatialFullConvolutionMap(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self._backend.SpatialFullConvolutionMap_updateGradInput( self._backend.library_state, @@ -73,7 +73,6 @@ class SpatialFullConvolutionMap(Module): ) return self.gradInput - def accGradParameters(self, input, gradOutput, scale=1): self._backend.SpatialFullConvolutionMap_accGradParameters( self._backend.library_state, @@ -87,4 +86,3 @@ class SpatialFullConvolutionMap(Module): self.dW, self.dH, scale ) - diff --git a/torch/legacy/nn/SpatialLPPooling.py b/torch/legacy/nn/SpatialLPPooling.py index 82b8f04a66..cf84593da1 100644 --- a/torch/legacy/nn/SpatialLPPooling.py +++ b/torch/legacy/nn/SpatialLPPooling.py @@ -7,6 +7,7 @@ from .SpatialAveragePooling import SpatialAveragePooling from .MulConstant import MulConstant from .Sqrt import Sqrt + class SpatialLPPooling(Sequential): def __init__(self, nInputPlane, pnorm, kW, kH, dW=None, dH=None): @@ -21,16 +22,16 @@ class SpatialLPPooling(Sequential): self.dH = dH if pnorm == 2: - self.add(Square()) + self.add(Square()) else: - self.add(Power(pnorm)) + self.add(Power(pnorm)) self.add(SpatialAveragePooling(kW, kH, dW, dH)) - self.add(MulConstant(kW*kH)) + self.add(MulConstant(kW * kH)) if pnorm == 2: - self.add(Sqrt()) + self.add(Sqrt()) else: - self.add(Power(1./pnorm)) + self.add(Power(1. / pnorm)) # the module is a Sequential: by default, it'll try to learn the parameters # of the sub sampler: we avoid that by redefining its methods. 
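
For reference (not part of the diff): the Square/Power -> SpatialAveragePooling -> MulConstant(kW*kH) -> Sqrt/Power(1/pnorm) chain assembled in the hunk above effectively computes an Lp pooling, i.e. for every kW x kH window it returns (sum of x**p over the window) ** (1/p), because scaling the window average by kW*kH recovers the window sum. A minimal pure-Python sketch of that identity, on hypothetical values:

    # One 2x2 window, p = 2 (the Square()/Sqrt() branch of the module above).
    window = [1.0, 2.0, 3.0, 4.0]
    p, k = 2, len(window)
    avg_of_powers = sum(v ** p for v in window) / k      # what SpatialAveragePooling produces
    lp = (avg_of_powers * k) ** (1.0 / p)                # MulConstant(kW*kH), then the p-th root
    assert abs(lp - sum(v ** p for v in window) ** (1.0 / p)) < 1e-12
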
@@ -48,4 +49,3 @@ class SpatialLPPooling(Sequential): def updateParameters(self, learningRate): pass - diff --git a/torch/legacy/nn/SpatialMaxPooling.py b/torch/legacy/nn/SpatialMaxPooling.py index d53b2e2ca2..83eca1f257 100644 --- a/torch/legacy/nn/SpatialMaxPooling.py +++ b/torch/legacy/nn/SpatialMaxPooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class SpatialMaxPooling(Module): def __init__(self, kW, kH, dW=None, dH=None, padW=0, padH=0): @@ -31,12 +32,12 @@ class SpatialMaxPooling(Module): def updateOutput(self, input): if self.indices is None: - self.indices = input.new() + self.indices = input.new() self.indices = self.indices.long() dims = input.dim() - self.iheight = input.size(dims-2) - self.iwidth = input.size(dims-1) + self.iheight = input.size(dims - 2) + self.iwidth = input.size(dims - 1) self._backend.SpatialMaxPooling_updateOutput( self._backend.library_state, diff --git a/torch/legacy/nn/SpatialMaxUnpooling.py b/torch/legacy/nn/SpatialMaxUnpooling.py index 789046bb7a..477ef43124 100644 --- a/torch/legacy/nn/SpatialMaxUnpooling.py +++ b/torch/legacy/nn/SpatialMaxUnpooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .SpatialMaxPooling import SpatialMaxPooling + class SpatialMaxUnpooling(Module): def __init__(self, poolingModule): @@ -41,4 +42,3 @@ class SpatialMaxUnpooling(Module): def __repr__(self): return 'nn.SpatialMaxUnpooling associated to ' + self.pooling.__repr__() - diff --git a/torch/legacy/nn/SpatialReflectionPadding.py b/torch/legacy/nn/SpatialReflectionPadding.py index d9f2cb32e8..b8f3d15ba3 100644 --- a/torch/legacy/nn/SpatialReflectionPadding.py +++ b/torch/legacy/nn/SpatialReflectionPadding.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialReflectionPadding(Module): def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): @@ -24,9 +25,9 @@ class SpatialReflectionPadding(Module): def updateGradInput(self, input, gradOutput): assert input.dim() == 4 and gradOutput.dim() == 4 assert input.size(0) == gradOutput.size(0) and \ - input.size(1) == gradOutput.size(1) and \ - input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ - input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) + input.size(1) == gradOutput.size(1) and \ + input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ + input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) self._backend.SpatialReflectionPadding_updateGradInput( self._backend.library_state, @@ -41,4 +42,3 @@ class SpatialReflectionPadding(Module): s = super(SpatialReflectionPadding, self).__repr__() s += '({}, {}, {}, {})'.format(self.pad_l, self.pad_r, self.pad_t, self.pad_b) return s - diff --git a/torch/legacy/nn/SpatialReplicationPadding.py b/torch/legacy/nn/SpatialReplicationPadding.py index 340f74d66b..67a79a965f 100644 --- a/torch/legacy/nn/SpatialReplicationPadding.py +++ b/torch/legacy/nn/SpatialReplicationPadding.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialReplicationPadding(Module): def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): @@ -24,9 +25,9 @@ class SpatialReplicationPadding(Module): def updateGradInput(self, input, gradOutput): assert input.dim() == 4 and gradOutput.dim() == 4 assert input.size(0) == gradOutput.size(0) and \ - input.size(1) == gradOutput.size(1) and \ - input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ - input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) + input.size(1) == gradOutput.size(1) and \ 
+ input.size(2) + self.pad_t + self.pad_b == gradOutput.size(2) and \ + input.size(3) + self.pad_l + self.pad_r == gradOutput.size(3) self._backend.SpatialReplicationPadding_updateGradInput( self._backend.library_state, @@ -42,4 +43,3 @@ class SpatialReplicationPadding(Module): s = super(SpatialReplicationPadding, self).__repr__() s += '({}, {}, {}, {})'.format(self.pad_l, self.pad_r, self.pad_t, self.pad_b) return s - diff --git a/torch/legacy/nn/SpatialSoftMax.py b/torch/legacy/nn/SpatialSoftMax.py index 7e2341a226..526e6d47dc 100644 --- a/torch/legacy/nn/SpatialSoftMax.py +++ b/torch/legacy/nn/SpatialSoftMax.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialSoftMax(Module): def updateOutput(self, input): @@ -11,7 +12,6 @@ class SpatialSoftMax(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self._backend.SoftMax_updateGradInput( self._backend.library_state, @@ -21,4 +21,3 @@ class SpatialSoftMax(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/SpatialSubSampling.py b/torch/legacy/nn/SpatialSubSampling.py index 1b2f7e9fd9..2429800f07 100644 --- a/torch/legacy/nn/SpatialSubSampling.py +++ b/torch/legacy/nn/SpatialSubSampling.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class SpatialSubSampling(Module): def __init__(self, nInputPlane, kW, kH, dW=1, dH=1): @@ -20,12 +21,11 @@ class SpatialSubSampling(Module): self.reset() - def reset(self, stdv=None): if stdv is not None: stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kW*self.kH) + stdv = 1. / math.sqrt(self.kW * self.kH) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) @@ -42,7 +42,6 @@ class SpatialSubSampling(Module): ) return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: return @@ -58,8 +57,6 @@ class SpatialSubSampling(Module): ) return self.gradInput - - def accGradParameters(self, input, gradOutput, scale=1): self._backend.SpatialSubSampling_accGradParameters( self._backend.library_state, @@ -71,4 +68,3 @@ class SpatialSubSampling(Module): self.dW, self.dH, scale ) - diff --git a/torch/legacy/nn/SpatialSubtractiveNormalization.py b/torch/legacy/nn/SpatialSubtractiveNormalization.py index 0e68558dd7..2685bb4705 100644 --- a/torch/legacy/nn/SpatialSubtractiveNormalization.py +++ b/torch/legacy/nn/SpatialSubtractiveNormalization.py @@ -10,6 +10,7 @@ from .CSubTable import CSubTable from .CDivTable import CDivTable from .utils import clear + class SpatialSubtractiveNormalization(Module): def __init__(self, nInputPlane=1, kernel=None): @@ -24,19 +25,19 @@ class SpatialSubtractiveNormalization(Module): # check args if kdim != 2 and kdim != 1: - raise ValueError('SpatialSubtractiveNormalization averaging kernel must be 2D or 1D') + raise ValueError('SpatialSubtractiveNormalization averaging kernel must be 2D or 1D') if (self.kernel.size(0) % 2) == 0 or (kdim == 2 and (self.kernel.size(1) % 2) == 0): - raise ValueError('SpatialSubtractiveNormalization averaging kernel must have ODD dimensions') + raise ValueError('SpatialSubtractiveNormalization averaging kernel must have ODD dimensions') # normalize kernel self.kernel.div_(self.kernel.sum() * self.nInputPlane) # padding values - padH = int(math.floor(self.kernel.size(0)/2)) + padH = int(math.floor(self.kernel.size(0) / 2)) padW = padH if kdim == 2: - padW = int(math.floor(self.kernel.size(1)/2)) + padW = int(math.floor(self.kernel.size(1) / 2)) # create convolutional mean extractor self.meanestimator = 
Sequential() @@ -45,7 +46,8 @@ class SpatialSubtractiveNormalization(Module): self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, self.kernel.size(1), self.kernel.size(0))) else: # TODO: map - self.meanestimator.add(SpatialConvolutionMap(SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) + self.meanestimator.add(SpatialConvolutionMap( + SpatialConvolutionMap.maps.oneToOne(self.nInputPlane), self.kernel.size(0), 1)) self.meanestimator.add(SpatialConvolution(self.nInputPlane, 1, 1, self.kernel.size(0))) self.meanestimator.add(Replicate(self.nInputPlane, 0)) @@ -76,7 +78,7 @@ class SpatialSubtractiveNormalization(Module): def updateOutput(self, input): # compute side coefficients dim = input.dim() - if input.dim() + 1 != self.coef.dim() or (input.size(dim-1) != self.coef.size(dim-1)) or (input.size(dim-2) != self.coef.size(dim-2)): + if input.dim() + 1 != self.coef.dim() or (input.size(dim - 1) != self.coef.size(dim - 1)) or (input.size(dim - 2) != self.coef.size(dim - 2)): if self.ones is None: self.ones = input.new() if self._coef is None: @@ -84,7 +86,7 @@ class SpatialSubtractiveNormalization(Module): self.ones.resize_as_(input[0:1]).fill_(1) coef = self.meanestimator.updateOutput(self.ones).squeeze(0) - self._coef.resize_as_(coef).copy_(coef) # make contiguous for view + self._coef.resize_as_(coef).copy_(coef) # make contiguous for view size = list(coef.size()) size = [input.size(0)] + size self.coef = self._coef.view(1, *self._coef.size()).expand(*size) diff --git a/torch/legacy/nn/SpatialUpSamplingNearest.py b/torch/legacy/nn/SpatialUpSamplingNearest.py index 4fcf9ab0bb..11d6524be5 100644 --- a/torch/legacy/nn/SpatialUpSamplingNearest.py +++ b/torch/legacy/nn/SpatialUpSamplingNearest.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialUpSamplingNearest(Module): """ Applies a 2D up-sampling over an input image composed of several input planes. 
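
For reference (not part of the diff): nearest-neighbour up-sampling by an integer factor replicates each input pixel into a scale x scale block, i.e. out[y][x] = inp[y // scale][x // scale]. A minimal pure-Python sketch with a hypothetical helper name, illustrating the mapping the module applies per plane:

    def upsample_nearest_2d(inp, scale):
        # inp is a 2D list of values; each value is copied into a scale x scale block.
        h, w = len(inp), len(inp[0])
        return [[inp[y // scale][x // scale] for x in range(w * scale)]
                for y in range(h * scale)]

    assert upsample_nearest_2d([[1, 2]], 2) == [[1, 1, 2, 2], [1, 1, 2, 2]]
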
@@ -41,7 +42,6 @@ class SpatialUpSamplingNearest(Module): ) return self.output - def updateGradInput(self, input, gradOutput): self.gradInput.resize_as_(input) self._backend.SpatialUpSamplingNearest_updateGradInput( diff --git a/torch/legacy/nn/SpatialZeroPadding.py b/torch/legacy/nn/SpatialZeroPadding.py index 18d6783082..a97d151336 100644 --- a/torch/legacy/nn/SpatialZeroPadding.py +++ b/torch/legacy/nn/SpatialZeroPadding.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SpatialZeroPadding(Module): def __init__(self, pad_l, pad_r=None, pad_t=None, pad_b=None): @@ -78,4 +79,3 @@ class SpatialZeroPadding(Module): s = super(SpatialZeroPadding, self).__repr__() s += '({}, {}, {}, {})'.foramat(self.pad_l, self.pad_r, self.pad_t, self.pad_b) return s - diff --git a/torch/legacy/nn/SplitTable.py b/torch/legacy/nn/SplitTable.py index 6f2f12ed46..c93079d574 100644 --- a/torch/legacy/nn/SplitTable.py +++ b/torch/legacy/nn/SplitTable.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class SplitTable(Module): def __init__(self, dimension): @@ -10,7 +11,7 @@ class SplitTable(Module): def _getPositiveDimension(self, input): dimension = self.dimension if dimension < 0: - dimension = input.dim() + dimension + dimension = input.dim() + dimension return dimension @@ -36,4 +37,3 @@ class SplitTable(Module): self.gradInput.select(dimension, i).copy_(gradOutput[i]) return self.gradInput - diff --git a/torch/legacy/nn/Sqrt.py b/torch/legacy/nn/Sqrt.py index f68d93e3fe..e046594be2 100644 --- a/torch/legacy/nn/Sqrt.py +++ b/torch/legacy/nn/Sqrt.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Sqrt(Module): def __init__(self, b=0, eps=0): @@ -26,4 +27,3 @@ class Sqrt(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/Square.py b/torch/legacy/nn/Square.py index ec05c19377..9ebaa371ed 100644 --- a/torch/legacy/nn/Square.py +++ b/torch/legacy/nn/Square.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Square(Module): def updateOutput(self, input): @@ -19,4 +20,3 @@ class Square(Module): self.gradInput ) return self.gradInput - diff --git a/torch/legacy/nn/Squeeze.py b/torch/legacy/nn/Squeeze.py index a9b1372c8d..2a4578f1f7 100644 --- a/torch/legacy/nn/Squeeze.py +++ b/torch/legacy/nn/Squeeze.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Squeeze(Module): def __init__(self, dim=None): @@ -12,9 +13,7 @@ class Squeeze(Module): self.output.set_(input.squeeze(dim) if dim is not None else input.squeeze()) return self.output - def updateGradInput(self, input, gradOutput): assert input.nelement() == gradOutput.nelement() self.gradInput.set_(gradOutput.view_as(input)) return self.gradInput - diff --git a/torch/legacy/nn/Sum.py b/torch/legacy/nn/Sum.py index df89552bf7..84adb89b6e 100644 --- a/torch/legacy/nn/Sum.py +++ b/torch/legacy/nn/Sum.py @@ -37,7 +37,7 @@ class Sum(Module): size[dimension] = 1 if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput diff --git a/torch/legacy/nn/Tanh.py b/torch/legacy/nn/Tanh.py index 57e524e44f..bcee876b38 100644 --- a/torch/legacy/nn/Tanh.py +++ b/torch/legacy/nn/Tanh.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Tanh(Module): def updateOutput(self, input): @@ -20,4 +21,3 @@ class Tanh(Module): self.output ) return self.gradInput - diff --git a/torch/legacy/nn/TanhShrink.py 
b/torch/legacy/nn/TanhShrink.py index 2faf98b8f6..36ef0f1689 100644 --- a/torch/legacy/nn/TanhShrink.py +++ b/torch/legacy/nn/TanhShrink.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .Tanh import Tanh + class TanhShrink(Module): def __init__(self): @@ -19,4 +20,3 @@ class TanhShrink(Module): self.gradInput.resize_as_(input).copy_(gradOutput) self.gradInput.add_(-1, dth) return self.gradInput - diff --git a/torch/legacy/nn/TemporalConvolution.py b/torch/legacy/nn/TemporalConvolution.py index 3b9383e710..4ac04f264e 100644 --- a/torch/legacy/nn/TemporalConvolution.py +++ b/torch/legacy/nn/TemporalConvolution.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class TemporalConvolution(Module): def __init__(self, inputFrameSize, outputFrameSize, kW, dW=1): @@ -12,18 +13,18 @@ class TemporalConvolution(Module): self.kW = kW self.dW = dW - self.weight = torch.Tensor(outputFrameSize, inputFrameSize*kW) + self.weight = torch.Tensor(outputFrameSize, inputFrameSize * kW) self.bias = torch.Tensor(outputFrameSize) - self.gradWeight = torch.Tensor(outputFrameSize, inputFrameSize*kW) + self.gradWeight = torch.Tensor(outputFrameSize, inputFrameSize * kW) self.gradBias = torch.Tensor(outputFrameSize) self.reset() def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kW*self.inputFrameSize) + stdv = 1. / math.sqrt(self.kW * self.inputFrameSize) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) @@ -67,4 +68,3 @@ class TemporalConvolution(Module): self.dW, scale ) - diff --git a/torch/legacy/nn/TemporalMaxPooling.py b/torch/legacy/nn/TemporalMaxPooling.py index 01ad2e3e69..d3088ca4f3 100644 --- a/torch/legacy/nn/TemporalMaxPooling.py +++ b/torch/legacy/nn/TemporalMaxPooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class TemporalMaxPooling(Module): def __init__(self, kW, dW=None): @@ -12,7 +13,7 @@ class TemporalMaxPooling(Module): def updateOutput(self, input): if self.indices is None: - self.indices = input.new() + self.indices = input.new() self._backend.TemporalMaxPooling_updateOutput( self._backend.library_state, input, @@ -23,10 +24,9 @@ class TemporalMaxPooling(Module): ) return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: - return + return self._backend.TemporalMaxPooling_updateGradInput( self._backend.library_state, input, @@ -41,4 +41,3 @@ class TemporalMaxPooling(Module): def clearState(self): clear(self, 'indices') return super(TemporalMaxPooling, self).clearState() - diff --git a/torch/legacy/nn/TemporalSubSampling.py b/torch/legacy/nn/TemporalSubSampling.py index af662f4751..823070bbcc 100644 --- a/torch/legacy/nn/TemporalSubSampling.py +++ b/torch/legacy/nn/TemporalSubSampling.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class TemporalSubSampling(Module): def __init__(self, inputFrameSize, kW, dW=1): @@ -20,9 +21,9 @@ class TemporalSubSampling(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kW) + stdv = 1. 
/ math.sqrt(self.kW) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) @@ -40,7 +41,6 @@ class TemporalSubSampling(Module): ) return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: return @@ -66,4 +66,3 @@ class TemporalSubSampling(Module): self.dW, scale ) - diff --git a/torch/legacy/nn/Threshold.py b/torch/legacy/nn/Threshold.py index 178cb8f27a..f151d023e8 100644 --- a/torch/legacy/nn/Threshold.py +++ b/torch/legacy/nn/Threshold.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Threshold(Module): def __init__(self, threshold=0, value=0, inplace=False): @@ -41,7 +42,4 @@ class Threshold(Module): if self.inplace: if self.value > self.threshold: raise RuntimeError('in-place processing requires value ({}) to not ' - 'exceed threshold ({})'.format(self.value, self.threshold)) - - - + 'exceed threshold ({})'.format(self.value, self.threshold)) diff --git a/torch/legacy/nn/Transpose.py b/torch/legacy/nn/Transpose.py index 70bf068758..4478c251e0 100644 --- a/torch/legacy/nn/Transpose.py +++ b/torch/legacy/nn/Transpose.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class Transpose(Module): # transpose dimensions: # n = nn.Transpose({1, 4}, {1, 3}) @@ -12,14 +13,12 @@ class Transpose(Module): def updateOutput(self, input): for perm in self.permutations: - input = input.transpose(*perm) + input = input.transpose(*perm) self.output.resize_as_(input).copy_(input) return self.output def updateGradInput(self, input, gradOutput): for perm in self.permutations[::-1]: - gradOutput = gradOutput.transpose(*perm) + gradOutput = gradOutput.transpose(*perm) self.gradInput.resize_as_(gradOutput).copy_(gradOutput) return self.gradInput - - diff --git a/torch/legacy/nn/Unsqueeze.py b/torch/legacy/nn/Unsqueeze.py index 089a07113e..a78ac3022c 100644 --- a/torch/legacy/nn/Unsqueeze.py +++ b/torch/legacy/nn/Unsqueeze.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import addSingletondimension + class Unsqueeze(Module): def __init__(self, dim): @@ -19,4 +20,3 @@ class Unsqueeze(Module): def __repr__(self): return super(Unsqueeze, self).__repr__() + '({})'.format(self.dim) - diff --git a/torch/legacy/nn/View.py b/torch/legacy/nn/View.py index 7bcef91f34..d228fb15a0 100644 --- a/torch/legacy/nn/View.py +++ b/torch/legacy/nn/View.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class View(Module): def resetSize(self, *args): @@ -28,14 +29,13 @@ class View(Module): def updateOutput(self, input): if self.output is None: - self.output = input.new() + self.output = input.new() self.output = input.view(self.size) return self.output - def updateGradInput(self, input, gradOutput): if self.gradInput is None: - self.gradInput = gradOutput.new() + self.gradInput = gradOutput.new() self.gradInput = gradOutput.view(input.size()) return self.gradInput diff --git a/torch/legacy/nn/VolumetricAveragePooling.py b/torch/legacy/nn/VolumetricAveragePooling.py index e6190b9ec0..a89ba4cade 100644 --- a/torch/legacy/nn/VolumetricAveragePooling.py +++ b/torch/legacy/nn/VolumetricAveragePooling.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class VolumetricAveragePooling(Module): def __init__(self, kT, kW, kH, dT=None, dW=None, dH=None): @@ -38,4 +39,3 @@ class VolumetricAveragePooling(Module): s += '({}x{}x{}, {}, {}, {}'.format(self.kT, self.kW, self.kH, self.dT, self.dW, self.dH) s += ')' return s - diff --git a/torch/legacy/nn/VolumetricBatchNormalization.py b/torch/legacy/nn/VolumetricBatchNormalization.py 
index 2ebd14c359..61bab4c6ef 100644 --- a/torch/legacy/nn/VolumetricBatchNormalization.py +++ b/torch/legacy/nn/VolumetricBatchNormalization.py @@ -2,5 +2,6 @@ import torch from .Module import Module from .BatchNormalization import BatchNormalization + class VolumetricBatchNormalization(BatchNormalization): nDim = 5 diff --git a/torch/legacy/nn/VolumetricConvolution.py b/torch/legacy/nn/VolumetricConvolution.py index c28f21ba30..a4060d95a4 100644 --- a/torch/legacy/nn/VolumetricConvolution.py +++ b/torch/legacy/nn/VolumetricConvolution.py @@ -3,6 +3,7 @@ import torch from .Module import Module from .utils import clear + class VolumetricConvolution(Module): def __init__(self, nInputPlane, nOutputPlane, kT, kW, kH, dT=1, dW=1, dH=1, padT=0, padW=None, padH=None): @@ -31,24 +32,24 @@ class VolumetricConvolution(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.kT*self.kW*self.kH*self.nInputPlane) + stdv = 1. / math.sqrt(self.kT * self.kW * self.kH * self.nInputPlane) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) def _makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input + if self._input is None: + self._input = input.new() + self._input.resize_as_(input).copy_(input) + input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput return input, gradOutput @@ -68,9 +69,9 @@ class VolumetricConvolution(Module): def updateOutput(self, input): if self.finput is None: - self.finput = input.new() + self.finput = input.new() if self.fgradInput is None: - self.fgradInput = input.new() + self.fgradInput = input.new() if input.type() == 'torch.cuda.FloatTensor': self._backend.VolumetricConvolution_updateOutput( self._backend.library_state, @@ -178,10 +179,10 @@ class VolumetricConvolution(Module): s += '({} -> {}, {}x{}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH) if self.dT != 1 or self.dW != 1 or self.dH != 1 or \ self.padT != 0 or self.padW != 0 or self.padH != 0: - s += ', {}, {}, {}'.format(self.dT, self.dW, self.dH) + s += ', {}, {}, {}'.format(self.dT, self.dW, self.dH) if self.padT != 0 or self.padW != 0 or self.padH != 0: - s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) + s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) s += ')' return s diff --git a/torch/legacy/nn/VolumetricDropout.py b/torch/legacy/nn/VolumetricDropout.py index dda60c82f7..4d3c244a10 100644 --- a/torch/legacy/nn/VolumetricDropout.py +++ b/torch/legacy/nn/VolumetricDropout.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class VolumetricDropout(Module): def __init__(self, p=0.5): @@ -16,20 +17,20 @@ class VolumetricDropout(Module): assert input.dim() == 5 self.noise.resize_(input.size(0), input.size(1), 1, 1, 1) - self.noise.bernoulli_(1-self.p) + self.noise.bernoulli_(1 - self.p) # We expand the random dropouts to the entire feature map because the # features are likely correlated accross the map and so the dropout # should also be correlated. 
self.output.mul_(self.noise.expand_as(input)) else: - self.output.mul_(1-self.p) + self.output.mul_(1 - self.p) return self.output def updateGradInput(self, input, gradOutput): if self.train: self.gradInput.resize_as_(gradOutput).copy_(gradOutput) - self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector + self.gradInput.mul_(self.noise.expand_as(input)) # simply mask the gradients with the noise vector else: raise RuntimeError('backprop only defined while training') @@ -44,4 +45,3 @@ class VolumetricDropout(Module): def clearState(self): clear(self, 'noise') return super(VolumetricDropout, self).clearState() - diff --git a/torch/legacy/nn/VolumetricFullConvolution.py b/torch/legacy/nn/VolumetricFullConvolution.py index 84ee984471..5f5fd01552 100644 --- a/torch/legacy/nn/VolumetricFullConvolution.py +++ b/torch/legacy/nn/VolumetricFullConvolution.py @@ -2,13 +2,14 @@ import math import torch from .Module import Module + class VolumetricFullConvolution(Module): def __init__(self, nInputPlane, nOutputPlane, - kT, kW, kH, # kernel size - dT=1, dW=1, dH=1, # stride - padT=0, padW=0, padH=0, # padding - adjT=0, adjW=0, adjH=0): # extra output adjustment + kT, kW, kH, # kernel size + dT=1, dW=1, dH=1, # stride + padT=0, padW=0, padH=0, # padding + adjT=0, adjW=0, adjH=0): # extra output adjustment super(VolumetricFullConvolution, self).__init__() self.nInputPlane = nInputPlane @@ -28,7 +29,7 @@ class VolumetricFullConvolution(Module): if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 or self.adjT > self.dT - 1: raise RuntimeError('adjW, adjH and adjT must be smaller than self.dW - 1, ' - ' self.dH - 1 and self.dT - 1 respectively') + ' self.dH - 1 and self.dT - 1 respectively') self.weight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW) self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW) @@ -41,7 +42,6 @@ class VolumetricFullConvolution(Module): self.reset() - def reset(self, stdv=None): if stdv is not None: stdv = stdv * math.sqrt(3) @@ -50,22 +50,22 @@ class VolumetricFullConvolution(Module): kT = self.kT kH = self.kH kW = self.kW - stdv = 1. / math.sqrt(kW*kH*kT*nInputPlane) + stdv = 1. 
/ math.sqrt(kW * kH * kT * nInputPlane) self.weight.uniform_(-stdv, stdv) self.bias.uniform_(-stdv, stdv) def _makeContiguous(self, input, gradOutput=None): if not input.is_contiguous(): - if self._input is None: - self._input = input.new() - self._input.resize_as_(input).copy_(input) - input = self._input + if self._input is None: + self._input = input.new() + self._input.resize_as_(input).copy_(input) + input = self._input if gradOutput is not None: if not gradOutput.is_contiguous(): if self._gradOutput is None: - self._gradOutput = gradOutput.new() + self._gradOutput = gradOutput.new() self._gradOutput.resize_as_(gradOutput).copy_(gradOutput) gradOutput = self._gradOutput return input, gradOutput @@ -85,9 +85,9 @@ class VolumetricFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tT = targetTensor.size(tDims-3) - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tT = targetTensor.size(tDims - 3) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) @@ -118,9 +118,9 @@ class VolumetricFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tT = targetTensor.size(tDims-3) - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tT = targetTensor.size(tDims - 3) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) @@ -145,9 +145,9 @@ class VolumetricFullConvolution(Module): if isinstance(input, list): # Create a zero tensor to be expanded and used as gradInput[1]. 
if self.zeroScalar is None: - self.zeroScalar = input[1].new(1).zero_() + self.zeroScalar = input[1].new(1).zero_() self.ones.resize_(input[1].dim()).fill_(1) - zeroTensor = self.zeroScalar.view(self.ones.tolist()).expand_as(input[1]) + zeroTensor = self.zeroScalar.view(self.ones.tolist()).expand_as(input[1]) self.gradInput = [self.gradInput, zeroTensor] return self.gradInput @@ -162,9 +162,9 @@ class VolumetricFullConvolution(Module): inputTensor = input[0] targetTensor = input[1] tDims = targetTensor.dim() - tT = targetTensor.size(tDims-3) - tH = targetTensor.size(tDims-2) - tW = targetTensor.size(tDims-1) + tT = targetTensor.size(tDims - 3) + tH = targetTensor.size(tDims - 2) + tW = targetTensor.size(tDims - 1) adjT = self._calculateAdj(tT, self.kT, self.padT, self.dT) adjW = self._calculateAdj(tW, self.kW, self.padW, self.dW) adjH = self._calculateAdj(tH, self.kH, self.padH, self.dH) @@ -193,12 +193,12 @@ class VolumetricFullConvolution(Module): s = super(VolumetricFullConvolution, self).__repr__() s += '({} -> {}, {}x{}x{}'.format(self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH) if self.dT != 1 or self.dW != 1 or self.dH != 1 or \ - self.padT != 0 or self.padW != 0 or self.padH != 0 or \ - self.adjT != 0 or self.adjW != 0 or self.adjH != 0: + self.padT != 0 or self.padW != 0 or self.padH != 0 or \ + self.adjT != 0 or self.adjW != 0 or self.adjH != 0: s += ', {}, {}, {}'.format(self.dT, self.dW, self.dH) if self.padT != 0 or self.padW != 0 or self.padH != 0 or \ - self.adjT != 0 or self.adjW != 0 or self.adjH != 0: + self.adjT != 0 or self.adjW != 0 or self.adjH != 0: s += ', {}, {}, {}'.format(self.padT, self.padW, self.padH) if self.adjT != 0 or self.adjW != 0 or self.adjH != 0: @@ -206,4 +206,3 @@ class VolumetricFullConvolution(Module): s += ')' return s - diff --git a/torch/legacy/nn/VolumetricMaxPooling.py b/torch/legacy/nn/VolumetricMaxPooling.py index edac36fb18..823ab05846 100644 --- a/torch/legacy/nn/VolumetricMaxPooling.py +++ b/torch/legacy/nn/VolumetricMaxPooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .utils import clear + class VolumetricMaxPooling(Module): def __init__(self, kT, kW, kH, dT=None, dW=None, dH=None, padT=0, padW=0, padH=0): @@ -22,21 +23,21 @@ class VolumetricMaxPooling(Module): self.indices = torch.LongTensor() def ceil(self): - self.ceil_mode = True - return self + self.ceil_mode = True + return self def floor(self): - self.ceil_mode = False - return self + self.ceil_mode = False + return self def updateOutput(self, input): dims = input.dim() - self.itime = input.size(dims-3) - self.iheight = input.size(dims-2) - self.iwidth = input.size(dims-1) + self.itime = input.size(dims - 3) + self.iheight = input.size(dims - 2) + self.iwidth = input.size(dims - 1) if self.indices is None: - self.indices = input.new() + self.indices = input.new() self.indices = self.indices.long() self._backend.VolumetricMaxPooling_updateOutput( self._backend.library_state, diff --git a/torch/legacy/nn/VolumetricMaxUnpooling.py b/torch/legacy/nn/VolumetricMaxUnpooling.py index 45432ccb98..4de3b52607 100644 --- a/torch/legacy/nn/VolumetricMaxUnpooling.py +++ b/torch/legacy/nn/VolumetricMaxUnpooling.py @@ -2,6 +2,7 @@ import torch from .Module import Module from .VolumetricMaxPooling import VolumetricMaxPooling + class VolumetricMaxUnpooling(Module): def __init__(self, poolingModule): @@ -53,4 +54,3 @@ class VolumetricMaxUnpooling(Module): def __repr__(self): return 'nn.VolumetricMaxUnpooling associated to ' + self.pooling.__repr__() - diff 
--git a/torch/legacy/nn/VolumetricReplicationPadding.py b/torch/legacy/nn/VolumetricReplicationPadding.py index b43d42c396..16cc7a1c09 100644 --- a/torch/legacy/nn/VolumetricReplicationPadding.py +++ b/torch/legacy/nn/VolumetricReplicationPadding.py @@ -1,6 +1,7 @@ import torch from .Module import Module + class VolumetricReplicationPadding(Module): def __init__(self, pleft, pright=None, ptop=None, pbottom=None, pfront=None, pback=None): @@ -48,8 +49,7 @@ class VolumetricReplicationPadding(Module): def __repr__(self): s = super(VolumetricReplicationPadding, self).__repr__() s += '({}, {}, {}, {}, {}, {})'.format(self.pleft, self.pright, - self.ptop, self.pbottom, - self.pfront, self.pback - ) + self.ptop, self.pbottom, + self.pfront, self.pback + ) return s - diff --git a/torch/legacy/nn/WeightedEuclidean.py b/torch/legacy/nn/WeightedEuclidean.py index 38256f1137..f171bbf020 100644 --- a/torch/legacy/nn/WeightedEuclidean.py +++ b/torch/legacy/nn/WeightedEuclidean.py @@ -2,6 +2,7 @@ import math import torch from .Module import Module + class WeightedEuclidean(Module): def __init__(self, inputSize, outputSize): @@ -36,40 +37,40 @@ class WeightedEuclidean(Module): def reset(self, stdv=None): if stdv is not None: - stdv = stdv * math.sqrt(3) + stdv = stdv * math.sqrt(3) else: - stdv = 1. / math.sqrt(self.weight.size(1)) + stdv = 1. / math.sqrt(self.weight.size(1)) self.weight.uniform_(-stdv, stdv) self.diagCov.fill_(1) def _view(self, res, src, *args): if src.is_contiguous(): - res.set_(src.view(*args)) + res.set_(src.view(*args)) else: - res.set_(src.contiguous().view(*args)) + res.set_(src.contiguous().view(*args)) def updateOutput(self, input): # lazy-initialize if self._diagCov is None: - self._diagCov = self.output.new() + self._diagCov = self.output.new() if self._input is None: - self._input = input.new() + self._input = input.new() if self._weight is None: - self._weight = self.weight.new() + self._weight = self.weight.new() if self._expand is None: - self._expand = self.output.new() + self._expand = self.output.new() if self._expand2 is None: - self._expand2 = self.output.new() + self._expand2 = self.output.new() if self._expand3 is None: - self._expand3 = self.output.new() + self._expand3 = self.output.new() if self._repeat is None: - self._repeat = self.output.new() + self._repeat = self.output.new() if self._repeat2 is None: - self._repeat2 = self.output.new() + self._repeat2 = self.output.new() if self._repeat3 is None: - self._repeat3 = self.output.new() + self._repeat3 = self.output.new() inputSize, outputSize = self.weight.size(0), self.weight.size(1) @@ -106,29 +107,28 @@ class WeightedEuclidean(Module): self._repeat.add_(-1, self._expand2) self._repeat.mul_(self._expand3) - torch.norm(self._repeat, 2, 1, out=self.output) self.output.resize_(batchSize, outputSize) else: - raise RuntimeError("1D or 2D input expected") + raise RuntimeError("1D or 2D input expected") return self.output def updateGradInput(self, input, gradOutput): if self.gradInput is None: - return + return if self._div is None: - self._div = input.new() + self._div = input.new() if self._output is None: - self._output = self.output.new() + self._output = self.output.new() if self._expand4 is None: - self._expand4 = input.new() + self._expand4 = input.new() if self._gradOutput is None: - self._gradOutput = input.new() + self._gradOutput = input.new() if not self.fastBackward: - self.updateOutput(input) + self.updateOutput(input) inputSize, outputSize = self.weight.size(0), self.weight.size(1) @@ -169,7 +169,6 @@ 
class WeightedEuclidean(Module): torch.mul(self._repeat, self._expand4, out=self._repeat2) self._repeat2.mul_(self._expand3) - torch.sum(self._repeat2, 2, out=self.gradInput) self.gradInput.resize_as_(input) else: @@ -203,11 +202,10 @@ class WeightedEuclidean(Module): else: torch.mul(self._repeat, self._expand4, out=self._repeat2) - self.gradDiagCov.add_(self._repeat2) elif input.dim() == 2: if self._sum is None: - self._sum = input.new() + self._sum = input.new() torch.sum(self._repeat2, 0, out=self._sum) self._sum.resize_(inputSize, outputSize) self.gradWeight.add_(-scale, self._sum) @@ -225,7 +223,6 @@ class WeightedEuclidean(Module): self._repeat.mul_(self._expand3) self._repeat.mul_(self._expand4) - torch.sum(self._repeat, 0, out=self._sum) self._sum.resize_(inputSize, outputSize) self.gradDiagCov.add_(scale, self._sum) @@ -261,4 +258,3 @@ class WeightedEuclidean(Module): self.accGradParameters(input, gradOutput, -lr) self.gradWeight = gradWeight self.gradDiagCov = gradDiagCov - diff --git a/torch/legacy/nn/WeightedMSECriterion.py b/torch/legacy/nn/WeightedMSECriterion.py index 36d8f75dcc..eb1a4dee33 100644 --- a/torch/legacy/nn/WeightedMSECriterion.py +++ b/torch/legacy/nn/WeightedMSECriterion.py @@ -1,6 +1,7 @@ import torch from .Criterion import Criterion + class WeightedMSECriterion(Criterion): def __init__(self, weight, sizeAverage=True): @@ -12,7 +13,7 @@ class WeightedMSECriterion(Criterion): def updateOutput(self, input, target): if self.buffer is None: - self.buffer = input.new() + self.buffer = input.new() self.buffer.resize_as_(input).copy_(target) if input.dim() - 1 == self.weight.dim(): for i in range(input.size(0)): @@ -21,7 +22,7 @@ class WeightedMSECriterion(Criterion): self.buffer.mul_(self.weight) if self.output_tensor is None: - self.output_tensor = input.new(1) + self.output_tensor = input.new(1) self._backend.MSECriterion_updateOutput( self._backend.library_state, input, @@ -48,4 +49,3 @@ class WeightedMSECriterion(Criterion): self.sizeAverage ) return self.gradInput - diff --git a/torch/legacy/nn/__init__.py b/torch/legacy/nn/__init__.py index 929601ebdc..7e2507ac0a 100644 --- a/torch/legacy/nn/__init__.py +++ b/torch/legacy/nn/__init__.py @@ -80,7 +80,7 @@ from .PairwiseDistance import PairwiseDistance from .ParallelCriterion import ParallelCriterion from .PartialLinear import PartialLinear from .Power import Power -from .RReLU import RReLU # TODO implement +from .RReLU import RReLU # TODO implement from .ReLU6 import ReLU6 from .Replicate import Replicate from .Reshape import Reshape diff --git a/torch/legacy/nn/utils.py b/torch/legacy/nn/utils.py index 8a76117f03..0432a6e3a0 100644 --- a/torch/legacy/nn/utils.py +++ b/torch/legacy/nn/utils.py @@ -13,6 +13,8 @@ import torch # > net1:type('torch.cuda.FloatTensor', tensorCache) # > net2:type('torch.cuda.FloatTensor', tensorCache) # > nn.utils.recursiveType(anotherTensor, 'torch.cuda.FloatTensor', tensorCache) + + def recursiveType(param, type, tensorCache={}): from .Criterion import Criterion from .Module import Module @@ -28,12 +30,13 @@ def recursiveType(param, type, tensorCache={}): newparam = tensorCache[key] else: newparam = torch.Tensor().type(type) - storageType = type.replace('Tensor','Storage') + storageType = type.replace('Tensor', 'Storage') param_storage = param.storage() if param_storage: storage_key = param_storage._cdata if storage_key not in tensorCache: - tensorCache[storage_key] = torch._import_dotted_name(storageType)(param_storage.size()).copy_(param_storage) + tensorCache[storage_key] = 
torch._import_dotted_name( + storageType)(param_storage.size()).copy_(param_storage) newparam.set_( tensorCache[storage_key], param.storage_offset(), @@ -44,6 +47,7 @@ def recursiveType(param, type, tensorCache={}): param = newparam return param + def recursiveResizeAs(t1, t2): if isinstance(t2, list): t1 = t1 if isinstance(t1, list) else [t1] @@ -56,20 +60,22 @@ def recursiveResizeAs(t1, t2): t1 = t1 if torch.is_tensor(t1) else t2.new() t1.resize_as_(t2) else: - raise RuntimeError("Expecting nested tensors or tables. Got " + \ - type(t1).__name__ + " and " + type(t2).__name__ + "instead") + raise RuntimeError("Expecting nested tensors or tables. Got " + + type(t1).__name__ + " and " + type(t2).__name__ + "instead") return t1, t2 + def recursiveFill(t2, val): if isinstance(t2, list): t2 = [recursiveFill(x, val) for x in t2] elif torch.is_tensor(t2): t2.fill_(val) else: - raise RuntimeError("expecting tensor or table thereof. Got " + \ - type(t2).__name__ + " instead") + raise RuntimeError("expecting tensor or table thereof. Got " + + type(t2).__name__ + " instead") return t2 + def recursiveAdd(t1, val=1, t2=None): if t2 is None: t2 = val @@ -81,10 +87,11 @@ def recursiveAdd(t1, val=1, t2=None): elif torch.is_tensor(t1) and torch.is_tensor(t2): t1.add_(val, t2) else: - raise RuntimeError("expecting nested tensors or tables. Got " + \ - type(t1).__name__ + " and " + type(t2).__name__ + " instead") + raise RuntimeError("expecting nested tensors or tables. Got " + + type(t1).__name__ + " and " + type(t2).__name__ + " instead") return t1, t2 + def recursiveCopy(t1, t2): if isinstance(t2, list): t1 = t1 if isinstance(t1, list) else [t1] @@ -94,10 +101,11 @@ def recursiveCopy(t1, t2): t1 = t1 if torch.is_tensor(t1) else t2.new() t1.resize_as_(t2).copy_(t2) else: - raise RuntimeError("expecting nested tensors or tables. Got " + \ - type(t1).__name__ + " and " + type(t2).__name__ + " instead") + raise RuntimeError("expecting nested tensors or tables. Got " + + type(t1).__name__ + " and " + type(t2).__name__ + " instead") return t1, t2 + def addSingletondimension(*args): view = None if len(args) < 3: @@ -109,6 +117,7 @@ def addSingletondimension(*args): view.set_(t) return view.unsqueeze_(dim) + def contiguousView(output, input, *args): if output is None: output = input.new() @@ -123,9 +132,12 @@ def contiguousView(output, input, *args): # go over specified fields and clear them. 
accepts # nn.clearState(self, ['_buffer', '_buffer2']) and # nn.clearState(self, '_buffer', '_buffer2') + + def clear(self, *args): if len(args) == 1 and isinstance(args[0], list): args = args[0] + def _clear(f): if not hasattr(self, f): return diff --git a/torch/legacy/optim/adadelta.py b/torch/legacy/optim/adadelta.py index 569634f1ad..1edd237560 100644 --- a/torch/legacy/optim/adadelta.py +++ b/torch/legacy/optim/adadelta.py @@ -32,7 +32,7 @@ def adadelta(opfunc, x, config, state=None): # (2) weight decay if wd != 0: - dfdx.add_(wd, x) + dfdx.add_(wd, x) # (3) parameter update if not 'paramVariance' in state: @@ -43,7 +43,8 @@ def adadelta(opfunc, x, config, state=None): state['paramVariance'].mul_(rho).addcmul_(1 - rho, dfdx, dfdx) state['paramStd'].resize_as_(state['paramVariance']).copy_(state['paramVariance']).add_(eps).sqrt_() - state['delta'].resize_as_(state['paramVariance']).copy_(state['accDelta']).add_(eps).sqrt_().div_(state['paramStd']).mul_(dfdx) + state['delta'].resize_as_(state['paramVariance']).copy_( + state['accDelta']).add_(eps).sqrt_().div_(state['paramStd']).mul_(dfdx) x.add_(-1, state['delta']) state['accDelta'].mul_(rho).addcmul_(1 - rho, state['delta'], state['delta']) diff --git a/torch/legacy/optim/adagrad.py b/torch/legacy/optim/adagrad.py index 3c4daee4fb..29757904c5 100644 --- a/torch/legacy/optim/adagrad.py +++ b/torch/legacy/optim/adagrad.py @@ -32,7 +32,6 @@ def adagrad(opfunc, x, config, state=None): if wd != 0: dfdx.add_(wd, x) - # (3) learning rate decay (annealing) clr = lr / (1 + state['evalCounter'] * lrd) @@ -50,4 +49,3 @@ def adagrad(opfunc, x, config, state=None): # return x*, f(x) before optimization return x, fx - diff --git a/torch/legacy/optim/adam.py b/torch/legacy/optim/adam.py index cb81225fbd..607735ab62 100644 --- a/torch/legacy/optim/adam.py +++ b/torch/legacy/optim/adam.py @@ -1,5 +1,6 @@ import math + def adam(opfunc, x, config, state=None): """ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf @@ -59,7 +60,7 @@ def adam(opfunc, x, config, state=None): biasCorrection1 = 1 - beta1 ** state['t'] biasCorrection2 = 1 - beta2 ** state['t'] - stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1 + stepSize = lr * math.sqrt(biasCorrection2) / biasCorrection1 # (3) update x x.addcdiv_(-stepSize, state['m'], state['denom']) diff --git a/torch/legacy/optim/adamax.py b/torch/legacy/optim/adamax.py index ebeba9f4b4..5f67ced452 100644 --- a/torch/legacy/optim/adamax.py +++ b/torch/legacy/optim/adamax.py @@ -1,5 +1,6 @@ import torch + def adamax(opfunc, x, config, state=None): """ An implementation of AdaMax http://arxiv.org/pdf/1412.6980.pdf diff --git a/torch/legacy/optim/asgd.py b/torch/legacy/optim/asgd.py index e503eb9057..eb44366e74 100644 --- a/torch/legacy/optim/asgd.py +++ b/torch/legacy/optim/asgd.py @@ -1,5 +1,6 @@ import math + def asgd(opfunc, x, config, state=None): """ An implementation of ASGD @@ -60,12 +61,11 @@ def asgd(opfunc, x, config, state=None): state['tmp'] = state.get('tmp', state['ax'].new().resize_as_(state['ax'])) if state['mu_t'] != 1: state['tmp'].copy_(x) - state['tmp'].add_(-1,state['ax']).mul_(state['mu_t']) + state['tmp'].add_(-1, state['ax']).mul_(state['mu_t']) state['ax'].add_(state['tmp']) else: state['ax'].copy_(x) - # (5) update eta_t and mu_t state['t'] += 1 state['eta_t'] = config['eta0'] / math.pow((1 + config['lambda'] * config['eta0'] * state['t']), config['alpha']) @@ -73,4 +73,3 @@ def asgd(opfunc, x, config, state=None): # return x*, f(x) before optimization, and 
average(x_t0,x_t1,x_t2,...) return x, fx, state['ax'] - diff --git a/torch/legacy/optim/cg.py b/torch/legacy/optim/cg.py index 9926a9c093..118de3bd96 100644 --- a/torch/legacy/optim/cg.py +++ b/torch/legacy/optim/cg.py @@ -2,9 +2,11 @@ import math INFINITY = float('inf') + def sqrt_nothrow(x): return math.sqrt(x) if x >= 0 else float('nan') + def cg(opfunc, x, config, state=None): """ @@ -45,11 +47,11 @@ def cg(opfunc, x, config, state=None): if config is None and state is None: raise ValueError("cg requires a dictionary to retain state between iterations") state = state if state is not None else config - rho = config.get('rho', 0.01) - sig = config.get('sig', 0.5) - _int = config.get('int', 0.1) - ext = config.get('ext', 3.0) - maxIter = config.get('maxIter', 20) + rho = config.get('rho', 0.01) + sig = config.get('sig', 0.5) + _int = config.get('int', 0.1) + ext = config.get('ext', 3.0) + maxIter = config.get('maxIter', 20) ratio = config.get('ratio', 100) maxEval = config.get('maxEval', maxIter * 1.25) red = 1 @@ -86,13 +88,13 @@ def cg(opfunc, x, config, state=None): f1, tdf = opfunc(x) fx.append(f1) df1.copy_(tdf) - i = i+1 + i = i + 1 # initial search direction s.copy_(df1).mul_(-1) - d1 = -s.dot(s ) # slope - z1 = red/(1-d1) # initial step + d1 = -s.dot(s) # slope + z1 = red / (1 - d1) # initial step while i < abs(maxEval): x0.copy_(x) @@ -113,16 +115,16 @@ def cg(opfunc, x, config, state=None): while (f2 > f1 + z1 * rho * d1 or d2 > -sig * d1) and m > 0: limit = z1 if f2 > f1: - z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3) + z2 = z3 - (0.5 * d3 * z3 * z3) / (d3 * z3 + f2 - f3) else: - A = 6*(f2-f3)/z3+3*(d2+d3) - B = 3*(f3-f2)-z3*(d3+2*d2) - z2 = (sqrt_nothrow(B*B-A*d2*z3*z3)-B)/A + A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) + B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) + z2 = (sqrt_nothrow(B * B - A * d2 * z3 * z3) - B) / A if z2 != z2 or z2 == INFINITY or z2 == -INFINITY: - z2 = z3/2 + z2 = z3 / 2 - z2 = max(min(z2, _int*z3), (1-_int)*z3) + z2 = max(min(z2, _int * z3), (1 - _int) * z3) z1 = z1 + z2 x.add_(z2, s) f2, tdf = opfunc(x) @@ -134,40 +136,40 @@ def cg(opfunc, x, config, state=None): if f2 > f1 + z1 * rho * d1 or d2 > -sig * d1: break - elif d2 > sig*d1: + elif d2 > sig * d1: success = 1 break elif m == 0: break - A = 6*(f2-f3)/z3+3*(d2+d3) - B = 3*(f3-f2)-z3*(d3+2*d2) - _denom = (B+sqrt_nothrow(B*B-A*d2*z3*z3)) - z2 = -d2*z3*z3/_denom if _denom != 0 else float('nan') + A = 6 * (f2 - f3) / z3 + 3 * (d2 + d3) + B = 3 * (f3 - f2) - z3 * (d3 + 2 * d2) + _denom = (B + sqrt_nothrow(B * B - A * d2 * z3 * z3)) + z2 = -d2 * z3 * z3 / _denom if _denom != 0 else float('nan') if z2 != z2 or z2 == INFINITY or z2 == -INFINITY or z2 < 0: if limit < -0.5: - z2 = z1 * (ext -1) + z2 = z1 * (ext - 1) else: - z2 = (limit-z1)/2 - elif (limit > -0.5) and (z2+z1) > limit: - z2 = (limit-z1)/2 - elif limit < -0.5 and (z2+z1) > z1*ext: - z2 = z1*(ext-1) - elif z2 < -z3*_int: - z2 = -z3*_int - elif limit > -0.5 and z2 < (limit-z1)*(1-_int): - z2 = (limit-z1)*(1-_int) + z2 = (limit - z1) / 2 + elif (limit > -0.5) and (z2 + z1) > limit: + z2 = (limit - z1) / 2 + elif limit < -0.5 and (z2 + z1) > z1 * ext: + z2 = z1 * (ext - 1) + elif z2 < -z3 * _int: + z2 = -z3 * _int + elif limit > -0.5 and z2 < (limit - z1) * (1 - _int): + z2 = (limit - z1) * (1 - _int) f3 = f2 d3 = d2 z3 = -z2 - z1 = z1+z2 + z1 = z1 + z2 x.add_(z2, s) f2, tdf = opfunc(x) df2.copy_(tdf) - i = i+1 + i = i + 1 m = m - 1 d2 = df2.dot(s) @@ -212,4 +214,3 @@ def cg(opfunc, x, config, state=None): state['x0'] = x0 state['s'] = s return x, fx, i - diff 
--git a/torch/legacy/optim/lbfgs.py b/torch/legacy/optim/lbfgs.py index e8f377f7d0..d6d48e6b2f 100644 --- a/torch/legacy/optim/lbfgs.py +++ b/torch/legacy/optim/lbfgs.py @@ -1,5 +1,6 @@ import torch + def lbfgs(opfunc, x, config, state=None): """ An implementation of L-BFGS, heavily inspired by minFunc (Mark Schmidt) @@ -80,8 +81,8 @@ def lbfgs(opfunc, x, config, state=None): if 'dir_bufs' not in state: # reusable buffers for y's and s's, and their histories verbose('creating recyclable direction/step/history buffers') - state['dir_bufs'] = list(g.new(nCorrection+1, p).split(1)) - state['stp_bufs'] = list(g.new(nCorrection+1, p).split(1)) + state['dir_bufs'] = list(g.new(nCorrection + 1, p).split(1)) + state['stp_bufs'] = list(g.new(nCorrection + 1, p).split(1)) for i in range(len(state['dir_bufs'])): state['dir_bufs'][i] = state['dir_bufs'][i].squeeze(0) state['stp_bufs'][i] = state['stp_bufs'][i].squeeze(0) @@ -155,7 +156,7 @@ def lbfgs(opfunc, x, config, state=None): al = state['al'] torch.mul(g, -1, out=q) - for i in range(k-1, -1, -1): + for i in range(k - 1, -1, -1): al[i] = old_dirs[i].dot(q) * ro[i] q.add_(-al[i], old_stps[i]) @@ -193,7 +194,7 @@ def lbfgs(opfunc, x, config, state=None): lsFuncEval = 0 if lineSearch is not None: # perform line search, using user function - f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts) + f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd, lineSearchOpts) f_hist.append(f) else: # no line search, simply move with fixed-step @@ -250,4 +251,4 @@ def lbfgs(opfunc, x, config, state=None): state['d'] = d # return optimal x, and history of f(x) - return x,f_hist,currentFuncEval + return x, f_hist, currentFuncEval diff --git a/torch/legacy/optim/nag.py b/torch/legacy/optim/nag.py index 9be9a106aa..8a8196fe7c 100644 --- a/torch/legacy/optim/nag.py +++ b/torch/legacy/optim/nag.py @@ -40,7 +40,6 @@ def nag(opfunc, x, config, state=None): if mom <= 0: raise ValueError('Momentum must be positive for Nesterov Accelerated Gradient') - # (1) evaluate f(x) and df/dx # first step in the direction of the momentum vector @@ -55,7 +54,6 @@ def nag(opfunc, x, config, state=None): if wd != 0: dfdx.add_(wd, x) - # (3) learning rate decay (annealing) clr = lr / (1 + state['evalCounter'] * lrd) @@ -65,7 +63,6 @@ def nag(opfunc, x, config, state=None): else: state['dfdx'].mul_(mom) - # (5) parameter update with single or individual learning rates if lrs is not None: if 'deltaParameters' in state: @@ -78,10 +75,8 @@ def nag(opfunc, x, config, state=None): x.add_(-clr, dfdx) state['dfdx'].add_(-clr, dfdx) - # (6) update evaluation counter state['evalCounter'] += 1 # return x, f(x) before optimization return x, fx - diff --git a/torch/legacy/optim/rmsprop.py b/torch/legacy/optim/rmsprop.py index 039b9cf399..351c8c3fe6 100644 --- a/torch/legacy/optim/rmsprop.py +++ b/torch/legacy/optim/rmsprop.py @@ -1,5 +1,6 @@ import torch + def rmsprop(opfunc, x, config, state=None): """ An implementation of RMSprop @@ -44,10 +45,9 @@ def rmsprop(opfunc, x, config, state=None): state['m'] = x.new().resize_as_(dfdx).zero_() state['tmp'] = x.new().resize_as_(dfdx) - # (4) calculate new (leaky) mean squared values state['m'].mul_(alpha) - state['m'].addcmul_(1.0-alpha, dfdx, dfdx) + state['m'].addcmul_(1.0 - alpha, dfdx, dfdx) # (5) perform update torch.sqrt(state['m'], out=state['tmp']).add_(epsilon) diff --git a/torch/legacy/optim/rprop.py b/torch/legacy/optim/rprop.py index 6d879b2610..691ef3c208 100644 --- a/torch/legacy/optim/rprop.py +++ 
b/torch/legacy/optim/rprop.py @@ -1,5 +1,6 @@ import torch + def rprop(opfunc, x, config, state=None): """ A plain implementation of RPROP @@ -42,22 +43,20 @@ def rprop(opfunc, x, config, state=None): # init temp storage if not 'delta' in state: - state['delta'] = dfdx.new(dfdx.size()).zero_() + state['delta'] = dfdx.new(dfdx.size()).zero_() state['stepsize'] = dfdx.new(dfdx.size()).fill_(stepsize) - state['sign'] = dfdx.new(dfdx.size()) + state['sign'] = dfdx.new(dfdx.size()) state['bytesign'] = torch.ByteTensor(dfdx.size()) - state['psign'] = torch.ByteTensor(dfdx.size()) - state['nsign'] = torch.ByteTensor(dfdx.size()) - state['zsign'] = torch.ByteTensor(dfdx.size()) - state['dminmax'] = torch.ByteTensor(dfdx.size()) + state['psign'] = torch.ByteTensor(dfdx.size()) + state['nsign'] = torch.ByteTensor(dfdx.size()) + state['zsign'] = torch.ByteTensor(dfdx.size()) + state['dminmax'] = torch.ByteTensor(dfdx.size()) if str(type(x)).find('Cuda') > -1: # Push to GPU - state['psign'] = state['psign'].cuda() - state['nsign'] = state['nsign'].cuda() - state['zsign'] = state['zsign'].cuda() - state['dminmax'] = state['dminmax'].cuda() - - + state['psign'] = state['psign'].cuda() + state['nsign'] = state['nsign'].cuda() + state['zsign'] = state['zsign'].cuda() + state['dminmax'] = state['dminmax'].cuda() # sign of derivative from last step to this one torch.mul(dfdx, state['delta'], out=state['sign']).sign_() @@ -98,4 +97,3 @@ def rprop(opfunc, x, config, state=None): # return x*, table of f(x) values from each step return x, hfx - diff --git a/torch/legacy/optim/sgd.py b/torch/legacy/optim/sgd.py index a6a6e817d0..69b756d6e2 100644 --- a/torch/legacy/optim/sgd.py +++ b/torch/legacy/optim/sgd.py @@ -1,5 +1,6 @@ import torch + def sgd(opfunc, x, config, state=None): """A plain implementation of SGD @@ -62,7 +63,7 @@ def sgd(opfunc, x, config, state=None): if 'dfdx' not in state: state['dfdx'] = torch.Tensor().type_as(dfdx).resize_as_(dfdx).copy_(dfdx) else: - state['dfdx'].mul_(mom).add_(1-damp, dfdx) + state['dfdx'].mul_(mom).add_(1 - damp, dfdx) if nesterov: dfdx.add_(mom, state['dfdx']) @@ -82,7 +83,6 @@ def sgd(opfunc, x, config, state=None): else: x.add_(-clr, dfdx) - # (6) update evaluation counter state['evalCounter'] += 1 diff --git a/torch/multiprocessing/queue.py b/torch/multiprocessing/queue.py index cc83b536bc..ad0a32b08b 100644 --- a/torch/multiprocessing/queue.py +++ b/torch/multiprocessing/queue.py @@ -26,6 +26,7 @@ class ConnectionWrapper(object): class Queue(multiprocessing.queues.Queue): + def __init__(self, *args, **kwargs): super(Queue, self).__init__(*args, **kwargs) self._reader = ConnectionWrapper(self._reader) @@ -35,6 +36,7 @@ class Queue(multiprocessing.queues.Queue): class SimpleQueue(multiprocessing.queues.SimpleQueue): + def _make_methods(self): if not isinstance(self._reader, ConnectionWrapper): self._reader = ConnectionWrapper(self._reader) diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py index e7e384b25a..4c2799c264 100644 --- a/torch/multiprocessing/reductions.py +++ b/torch/multiprocessing/reductions.py @@ -20,6 +20,7 @@ except ImportError: class StorageRef(object): # An object with a cdata field which may be set to None. We subclass object # instead of using a dict() to support weak references. 
+ def __init__(self, ptr): self.cdata = ptr diff --git a/torch/nn/_functions/activation.py b/torch/nn/_functions/activation.py index 4caaf0de82..433e8eee62 100644 --- a/torch/nn/_functions/activation.py +++ b/torch/nn/_functions/activation.py @@ -15,4 +15,3 @@ class Softsign(Function): self.buffer_squared = True grad_input = grad_output.clone().div_(self.buffer) return grad_input - diff --git a/torch/nn/_functions/batchnorm.py b/torch/nn/_functions/batchnorm.py index 6d80eb5d5e..ecf125f1ca 100644 --- a/torch/nn/_functions/batchnorm.py +++ b/torch/nn/_functions/batchnorm.py @@ -5,6 +5,7 @@ import torch.backends.cudnn as cudnn class BatchNorm(Function): + def __init__(self, running_mean, running_var, training, momentum, eps): super(BatchNorm, self).__init__() self.running_mean = running_mean diff --git a/torch/nn/_functions/conv.py b/torch/nn/_functions/conv.py index 6ba0b989d5..f9adca22f6 100644 --- a/torch/nn/_functions/conv.py +++ b/torch/nn/_functions/conv.py @@ -9,6 +9,7 @@ _thnn_convs = {} class ConvNd(Function): + def __init__(self, stride, padding, dilation, transposed, output_padding, groups): super(ConvNd, self).__init__() @@ -161,7 +162,7 @@ class ConvNd(Function): res.append(impl[fn_name](self, self._bufs[g], *grouped_args)) if fn_name == 'grad_params': return [torch.cat(t, 0) if t[0] is not None else None - for t in zip(*res)] + for t in zip(*res)] else: return torch.cat(res, 1) diff --git a/torch/nn/_functions/dropout.py b/torch/nn/_functions/dropout.py index 734eedc478..49ab427a8c 100644 --- a/torch/nn/_functions/dropout.py +++ b/torch/nn/_functions/dropout.py @@ -9,7 +9,7 @@ class Dropout(InplaceFunction): super(Dropout, self).__init__() if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) + "but got {}".format(p)) self.p = p self.train = train self.inplace = inplace @@ -26,7 +26,7 @@ class Dropout(InplaceFunction): if self.p > 0 and self.train: self.noise = self._make_noise(input) - self.noise.bernoulli_(1-self.p).div_(1-self.p) + self.noise.bernoulli_(1 - self.p).div_(1 - self.p) if self.p == 1: self.noise.fill_(0) self.noise = self.noise.expand_as(input) @@ -45,4 +45,4 @@ class FeatureDropout(Dropout): def _make_noise(self, input): return input.new().resize_(input.size(0), input.size(1), - *repeat(1, input.dim()-2)) + *repeat(1, input.dim() - 2)) diff --git a/torch/nn/_functions/linear.py b/torch/nn/_functions/linear.py index 279ea9c2b5..43bae458d5 100644 --- a/torch/nn/_functions/linear.py +++ b/torch/nn/_functions/linear.py @@ -29,4 +29,3 @@ class Linear(Function): return grad_input, grad_weight, grad_bias else: return grad_input, grad_weight - diff --git a/torch/nn/_functions/loss.py b/torch/nn/_functions/loss.py index 3a54f7be51..433b0e5e44 100644 --- a/torch/nn/_functions/loss.py +++ b/torch/nn/_functions/loss.py @@ -16,9 +16,9 @@ class CosineEmbeddingLoss(Function): return torch.ByteTensor() def forward(self, input1, input2, y): - self.w1 = input1.new() + self.w1 = input1.new() self.w22 = input1.new() - self.w = input1.new() + self.w = input1.new() self.w32 = input1.new() self._outputs = input1.new() @@ -98,6 +98,7 @@ class CosineEmbeddingLoss(Function): class HingeEmbeddingLoss(Function): + def __init__(self, margin=1, size_average=True): super(HingeEmbeddingLoss, self).__init__() self.margin = margin @@ -176,4 +177,3 @@ class MarginRankingLoss(Function): grad_input2.div_(y.size(0)) return grad_input1, grad_input2, None - diff --git a/torch/nn/_functions/rnn.py b/torch/nn/_functions/rnn.py index 
00e7b70897..eab7a5d65a 100644 --- a/torch/nn/_functions/rnn.py +++ b/torch/nn/_functions/rnn.py @@ -8,43 +8,43 @@ except ImportError: def RNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): - hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) - return hy + hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) + return hy def RNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): - hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) - return hy + hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh)) + return hy def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): - hx, cx = hidden - gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) - ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) + hx, cx = hidden + gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) + ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) - ingate = F.sigmoid(ingate) - forgetgate = F.sigmoid(forgetgate) - cellgate = F.tanh(cellgate) - outgate = F.sigmoid(outgate) + ingate = F.sigmoid(ingate) + forgetgate = F.sigmoid(forgetgate) + cellgate = F.tanh(cellgate) + outgate = F.sigmoid(outgate) - cy = (forgetgate * cx) + (ingate * cellgate) - hy = outgate * F.tanh(cy) + cy = (forgetgate * cx) + (ingate * cellgate) + hy = outgate * F.tanh(cy) - return hy, cy + return hy, cy def GRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): - gi = F.linear(input, w_ih, b_ih) - gh = F.linear(hidden, w_hh, b_hh) - i_r, i_i, i_n = gi.chunk(3, 1) - h_r, h_i, h_n = gh.chunk(3, 1) + gi = F.linear(input, w_ih, b_ih) + gh = F.linear(hidden, w_hh, b_hh) + i_r, i_i, i_n = gi.chunk(3, 1) + h_r, h_i, h_n = gh.chunk(3, 1) - resetgate = F.sigmoid(i_r + h_r) - inputgate = F.sigmoid(i_i + h_i) - newgate = F.tanh(i_n + resetgate * h_n) - hy = newgate + inputgate * (hidden - newgate) + resetgate = F.sigmoid(i_r + h_r) + inputgate = F.sigmoid(i_i + h_i) + newgate = F.tanh(i_n + resetgate * h_n) + hy = newgate + inputgate * (hidden - newgate) - return hy + return hy def StackedRNN(inners, num_layers, lstm=False, dropout=0, train=True): @@ -87,6 +87,7 @@ def StackedRNN(inners, num_layers, lstm=False, dropout=0, train=True): return forward + def Recurrent(inner, reverse=False): def forward(input, hidden, weight): output = [] @@ -144,7 +145,8 @@ def AutogradRNN(mode, input_size, hidden_size, num_layers=1, batch_first=False, class CudnnRNN(NestedIOFunction): - def __init__(self, mode, input_size, hidden_size, num_layers=1, batch_first=False, dropout=0, train=True, bidirectional=False, dropout_state=None): + + def __init__(self, mode, input_size, hidden_size, num_layers=1, batch_first=False, dropout=0, train=True, bidirectional=False, dropout_state=None): super(CudnnRNN, self).__init__() if dropout_state is None: dropout_state = {} @@ -177,7 +179,6 @@ class CudnnRNN(NestedIOFunction): self.save_for_backward(input, hx, weight, output) return output, hy - def backward_extended(self, grad_output, grad_hy): input, hx, weight, output = self.saved_tensors diff --git a/torch/nn/_functions/thnn/activation.py b/torch/nn/_functions/thnn/activation.py index 3ac85044fc..a37f004341 100644 --- a/torch/nn/_functions/thnn/activation.py +++ b/torch/nn/_functions/thnn/activation.py @@ -129,4 +129,3 @@ class Softmin(Function): _all_functions.append(PReLU) _all_functions.append(RReLU) _all_functions.append(Softmin) - diff --git a/torch/nn/_functions/thnn/auto.py b/torch/nn/_functions/thnn/auto.py index be8fef82dc..f3ef7057ac 100644 --- 
a/torch/nn/_functions/thnn/auto.py +++ b/torch/nn/_functions/thnn/auto.py @@ -32,20 +32,20 @@ def _make_function_class_criterion(class_name, update_output, update_grad_input, self._backend = type2backend[type(input)] self.save_for_backward(input, target) if weight_arg_idx >= 0: - insert_idx = weight_arg_idx - 4 # state, input, target, output + insert_idx = weight_arg_idx - 4 # state, input, target, output self.additional_args.insert(insert_idx, self.weight) for idx in buffers_idx: self.additional_args.insert(idx, input.new(1)) output = input.new(1) getattr(self._backend, update_output.name)(self._backend.library_state, input, target, - output, *self.additional_args) + output, *self.additional_args) return output def backward(self, grad_output): input, target = self.saved_tensors grad_input = grad_output.new().resize_as_(input).zero_() getattr(self._backend, update_grad_input.name)(self._backend.library_state, input, target, - grad_input, *self.additional_args) + grad_input, *self.additional_args) grad_output_expanded = grad_output.view(*repeat(1, grad_input.dim())) grad_input.mul_(grad_output_expanded.expand_as(grad_input)) return grad_input, None @@ -76,15 +76,15 @@ def _make_function_class(class_name, update_output, update_grad_input, acc_grad_ param_args = {'weight', 'bias'} ignored_args = {'weight', 'bias', 'gradWeight', 'gradBias', 'output'} expected_params = [arg for arg in update_output.arguments[3:] - if arg.name in param_args] + if arg.name in param_args] buffers = {} buffers['update_output'] = _find_buffers(update_output.arguments[3:], - ignored_args) + ignored_args) buffers['update_grad_input'] = _find_buffers( - update_grad_input.arguments[4:], ignored_args) + update_grad_input.arguments[4:], ignored_args) if acc_grad_parameters is not None: buffers['acc_grad_parameters'] = _find_buffers( - acc_grad_parameters.arguments[3:], ignored_args) + acc_grad_parameters.arguments[3:], ignored_args) # This and __init__ assume that only the last argument can be # an inplace flag @@ -112,8 +112,8 @@ def _make_function_class(class_name, update_output, update_grad_input, acc_grad_ for param in params: if type(param) != type(input): raise RuntimeError("input type ({}) doesn't match the type of " - "a parameter tensor ({})".format(torch.typename(input), - torch.typename(param))) + "a parameter tensor ({})".format(torch.typename(input), + torch.typename(param))) # Allocate temporary buffers and insert them into additional_args self.buffers = defaultdict(type(input)) @@ -246,10 +246,10 @@ def _generate_function_classes(scope_dict): # This has to call a function to retain correct references to functions if 'Criterion' in fn: cls = _make_function_class_criterion(class_name, update_output, - update_grad_input, acc_grad_parameters) + update_grad_input, acc_grad_parameters) else: cls = _make_function_class(class_name, update_output, - update_grad_input, acc_grad_parameters) + update_grad_input, acc_grad_parameters) scope_dict[class_name] = cls if not class_name.startswith('_'): _all_functions.append(cls) diff --git a/torch/nn/_functions/thnn/loss.py b/torch/nn/_functions/thnn/loss.py index 270414a3d8..72cc1d1b0f 100644 --- a/torch/nn/_functions/thnn/loss.py +++ b/torch/nn/_functions/thnn/loss.py @@ -34,4 +34,3 @@ class BCELoss(_BCELoss): _all_functions.append(BCELoss) - diff --git a/torch/nn/_functions/thnn/normalization.py b/torch/nn/_functions/thnn/normalization.py index 13ff3e2e1a..a80ad84cc2 100644 --- a/torch/nn/_functions/thnn/normalization.py +++ b/torch/nn/_functions/thnn/normalization.py @@ 
-42,10 +42,10 @@ class CrossMapLRN2d(Function): self.k ) else: - batch_size = input.size(0) - channels = input.size(1) + batch_size = input.size(0) + channels = input.size(1) input_height = input.size(2) - input_width = input.size(3) + input_width = input.size(3) output.resize_as_(input) self.scale.resize_as_(input) @@ -54,7 +54,7 @@ class CrossMapLRN2d(Function): input_square = output torch.pow(input, 2, out=input_square) - pre_pad = int((self.size - 1)/2 + 1) + pre_pad = int((self.size - 1) / 2 + 1) pre_pad_crop = channels if pre_pad > channels else pre_pad scale_first = self.scale.select(1, 0) @@ -67,7 +67,7 @@ class CrossMapLRN2d(Function): # by adding the next feature map and removing the previous for c in range(1, channels): scale_previous = self.scale.select(1, c - 1) - scale_current = self.scale.select(1, c) + scale_current = self.scale.select(1, c) scale_current.copy_(scale_previous) if c < channels - pre_pad + 1: square_next = input_square.select(1, c + pre_pad - 1) @@ -103,13 +103,13 @@ class CrossMapLRN2d(Function): self.k ) else: - batch_size = input.size(0) - channels = input.size(1) + batch_size = input.size(0) + channels = input.size(1) input_height = input.size(2) - input_width = input.size(3) + input_width = input.size(3) paddded_ratio = input.new(channels + self.size - 1, input_height, - input_width) + input_width) accum_ratio = input.new(input_height, input_width) cache_ratio_value = 2 * self.alpha * self.beta / self.size @@ -120,16 +120,16 @@ class CrossMapLRN2d(Function): paddded_ratio.zero_() padded_ratio_center = paddded_ratio.narrow(0, inversePrePad, - channels) + channels) for n in range(batch_size): torch.mul(grad_output[n], output[n], out=padded_ratio_center) padded_ratio_center.div_(self.scale[n]) torch.sum( - paddded_ratio.narrow(0, 0, self.size-1), 0, out=accum_ratio) + paddded_ratio.narrow(0, 0, self.size - 1), 0, out=accum_ratio) for c in range(channels): - accum_ratio.add_(paddded_ratio[c+self.size-1]) + accum_ratio.add_(paddded_ratio[c + self.size - 1]) grad_input[n][c].addcmul_(-cache_ratio_value, input[n][c], - accum_ratio) + accum_ratio) accum_ratio.add_(-1, paddded_ratio[c]) return grad_input diff --git a/torch/nn/_functions/thnn/pooling.py b/torch/nn/_functions/thnn/pooling.py index b771f5ff34..701101c76f 100644 --- a/torch/nn/_functions/thnn/pooling.py +++ b/torch/nn/_functions/thnn/pooling.py @@ -6,6 +6,7 @@ from torch.nn.modules.utils import _pair, _triple class MaxPool1d(Function): + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): self.kernel_size = kernel_size @@ -19,12 +20,12 @@ class MaxPool1d(Function): backend = type2backend[type(input)] indices, output = input.new().long(), input.new() backend.SpatialDilatedMaxPooling_updateOutput(backend.library_state, - input, output, indices, - self.kernel_size, 1, - self.stride, 1, - self.pad, 0, - self.dilation, 1, - self.ceil_mode) + input, output, indices, + self.kernel_size, 1, + self.stride, 1, + self.pad, 0, + self.dilation, 1, + self.ceil_mode) if indices.dim() == 4: # TODO: fix when THCUNN handles 3D indices properly indices = indices.squeeze(0) @@ -49,15 +50,17 @@ class MaxPool1d(Function): grad_input = grad_output.new() backend = type2backend[type(input)] backend.SpatialDilatedMaxPooling_updateGradInput(backend.library_state, - input, grad_output, grad_input, indices, - self.kernel_size, 1, - self.stride, 1, - self.pad, 0, - self.dilation, 1, - self.ceil_mode) + input, grad_output, grad_input, indices, + self.kernel_size, 1, + 
self.stride, 1, + self.pad, 0, + self.dilation, 1, + self.ceil_mode) return grad_input + class MaxPool2d(Function): + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): self.kernel_size = _pair(kernel_size) @@ -71,12 +74,12 @@ class MaxPool2d(Function): backend = type2backend[type(input)] indices, output = input.new().long(), input.new() backend.SpatialDilatedMaxPooling_updateOutput(backend.library_state, - input, output, indices, - self.kernel_size[1], self.kernel_size[0], - self.stride[1], self.stride[0], - self.padding[1], self.padding[0], - self.dilation[1], self.dilation[0], - self.ceil_mode) + input, output, indices, + self.kernel_size[1], self.kernel_size[0], + self.stride[1], self.stride[0], + self.padding[1], self.padding[0], + self.dilation[1], self.dilation[0], + self.ceil_mode) if self.return_indices: self.save_for_backward(input, indices) self.mark_non_differentiable(indices) @@ -95,21 +98,22 @@ class MaxPool2d(Function): grad_input = grad_output.new() backend = type2backend[type(input)] backend.SpatialDilatedMaxPooling_updateGradInput(backend.library_state, - input, grad_output, grad_input, indices, - self.kernel_size[1], self.kernel_size[0], - self.stride[1], self.stride[0], - self.padding[1], self.padding[0], - self.dilation[1], self.dilation[0], - self.ceil_mode) + input, grad_output, grad_input, indices, + self.kernel_size[1], self.kernel_size[0], + self.stride[1], self.stride[0], + self.padding[1], self.padding[0], + self.dilation[1], self.dilation[0], + self.ceil_mode) return grad_input + class MaxPool3d(Function): def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): self.kernel_size = _triple(kernel_size) self.stride = _triple(stride if stride is not None else kernel_size) - self.padding = _triple(padding) + self.padding = _triple(padding) self.dilation = _triple(dilation) self.return_indices = return_indices self.ceil_mode = ceil_mode @@ -118,12 +122,12 @@ class MaxPool3d(Function): backend = type2backend[type(input)] indices, output = input.new().long(), input.new() backend.VolumetricDilatedMaxPooling_updateOutput(backend.library_state, - input, output, indices, - self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], - self.stride[0], self.stride[2], self.stride[1], - self.padding[0], self.padding[2], self.padding[1], - self.dilation[0], self.dilation[2], self.dilation[1], - self.ceil_mode) + input, output, indices, + self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], + self.stride[0], self.stride[2], self.stride[1], + self.padding[0], self.padding[2], self.padding[1], + self.dilation[0], self.dilation[2], self.dilation[1], + self.ceil_mode) if self.return_indices: self.save_for_backward(input, indices) self.mark_non_differentiable(indices) @@ -142,16 +146,18 @@ class MaxPool3d(Function): grad_input = grad_output.new() backend = type2backend[type(input)] backend.VolumetricDilatedMaxPooling_updateGradInput(backend.library_state, - input, grad_output, grad_input, indices, - self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], - self.stride[0], self.stride[2], self.stride[1], - self.padding[0], self.padding[2], self.padding[1], - self.dilation[0], self.dilation[2], self.dilation[1], - self.ceil_mode) + input, grad_output, grad_input, indices, + self.kernel_size[0], self.kernel_size[ + 2], self.kernel_size[1], + self.stride[0], self.stride[2], self.stride[1], + self.padding[0], self.padding[2], self.padding[1], + self.dilation[0], 
self.dilation[2], self.dilation[1], + self.ceil_mode) return grad_input class MaxUnpool2d(Function): + def __init__(self, output_size): super(MaxUnpool2d, self).__init__() self.output_size = output_size @@ -175,6 +181,7 @@ class MaxUnpool2d(Function): class MaxUnpool3d(Function): + def __init__(self, output_size, stride, padding): super(MaxUnpool3d, self).__init__() self.output_size = output_size @@ -206,7 +213,7 @@ class MaxUnpool3d(Function): class FractionalMaxPool2d(Function): def __init__(self, kh, kw, output_size=None, output_ratio=None, - return_indices=False, _random_samples=None): + return_indices=False, _random_samples=None): super(FractionalMaxPool2d, self).__init__() # Pool size (how wide the pooling for each output unit is) @@ -234,7 +241,7 @@ class FractionalMaxPool2d(Function): def forward(self, input): if self.random_samples is None: random_samples = input.new().resize_(input.size(0), - input.size(1), 2).uniform_() + input.size(1), 2).uniform_() else: random_samples = self.random_samples self.random_samples = None @@ -257,7 +264,7 @@ class FractionalMaxPool2d(Function): random_samples ) - self.random_samples = None # Free unnecessary buffers + self.random_samples = None # Free unnecessary buffers if self.return_indices: self.save_for_backward(input, indices) return output, indices @@ -287,6 +294,7 @@ class FractionalMaxPool2d(Function): class AvgPool2d(Function): + def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True): self.kernel_size = _pair(kernel_size) @@ -322,7 +330,9 @@ class AvgPool2d(Function): self.ceil_mode, self.count_include_pad) return grad_input + class AvgPool3d(Function): + def __init__(self, kernel_size, stride=None): self.kernel_size = _triple(kernel_size) self.stride = _triple(stride if stride is not None else kernel_size) @@ -333,9 +343,9 @@ class AvgPool3d(Function): # can avoid this with cudnn self.save_for_backward(input) backend.VolumetricAveragePooling_updateOutput(backend.library_state, - input, output, - self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], - self.stride[0], self.stride[2], self.stride[1]) + input, output, + self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], + self.stride[0], self.stride[2], self.stride[1]) return output def backward(self, grad_output): @@ -343,9 +353,9 @@ class AvgPool3d(Function): input, = self.saved_tensors grad_input = grad_output.new() backend.VolumetricAveragePooling_updateGradInput(backend.library_state, - input, grad_output, grad_input, - self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], - self.stride[0], self.stride[2], self.stride[1]) + input, grad_output, grad_input, + self.kernel_size[0], self.kernel_size[2], self.kernel_size[1], + self.stride[0], self.stride[2], self.stride[1]) return grad_input _all_functions.append(AvgPool2d) diff --git a/torch/nn/_functions/thnn/sparse.py b/torch/nn/_functions/thnn/sparse.py index e717cd26cc..0d4e55e6a4 100644 --- a/torch/nn/_functions/thnn/sparse.py +++ b/torch/nn/_functions/thnn/sparse.py @@ -33,7 +33,7 @@ class Embedding(Function): def _make_sparse(self, indices): i = torch.LongTensor(2, indices.numel()) v = torch.ones(indices.numel()) - i[1].copy_(torch.range(0, indices.numel()-1)) + i[1].copy_(torch.range(0, indices.numel() - 1)) i[0].copy_(indices) return sparse.FloatTensor(i, v, torch.Size( [self._weight_size[0], indices.numel()])).contiguous() diff --git a/torch/nn/_functions/thnn/upsampling.py b/torch/nn/_functions/thnn/upsampling.py index 3ffbd999de..9faa8ad7c6 100644 --- 
a/torch/nn/_functions/thnn/upsampling.py +++ b/torch/nn/_functions/thnn/upsampling.py @@ -5,7 +5,9 @@ from torch._thnn import type2backend from . import _all_functions + class _UpsamplingBase(Function): + def __init__(self, size=None, scale_factor=None): super(_UpsamplingBase, self).__init__() if size is None and scale_factor is None: @@ -25,15 +27,15 @@ class UpsamplingNearest2d(_UpsamplingBase): if self.scale_factor is None: if (self.size[0] % input.size(2) != 0 or - self.size[1] % input.size(3) != 0): + self.size[1] % input.size(3) != 0): raise RuntimeError("output size specified in UpSamplingNearest " - "({}) has to be divisible by the input size, but got: " - "{}".format('x'.join(map(str, self.size)), - 'x'.join(map(str, input.size())))) + "({}) has to be divisible by the input size, but got: " + "{}".format('x'.join(map(str, self.size)), + 'x'.join(map(str, input.size())))) self.scale_factor = self.size[0] // input.size(2) if self.scale_factor != self.size[1] // input.size(3): raise RuntimeError("input aspect ratio doesn't match the " - "output ratio") + "output ratio") output = input.new() backend = type2backend[type(input)] diff --git a/torch/nn/backends/__init__.py b/torch/nn/backends/__init__.py index 8b13789179..e69de29bb2 100644 --- a/torch/nn/backends/__init__.py +++ b/torch/nn/backends/__init__.py @@ -1 +0,0 @@ - diff --git a/torch/nn/backends/backend.py b/torch/nn/backends/backend.py index 104d4a3eba..fb5424b67a 100644 --- a/torch/nn/backends/backend.py +++ b/torch/nn/backends/backend.py @@ -1,5 +1,6 @@ class FunctionBackend(object): + def __init__(self): self.function_classes = {} @@ -13,4 +14,3 @@ class FunctionBackend(object): if self.function_classes.get(name): raise RuntimeError("Trying to register second function under name " + name + " in " + type(self).__name__) self.function_classes[name] = function_class - diff --git a/torch/nn/backends/thnn.py b/torch/nn/backends/thnn.py index 4a761eff0e..9e2f14b54a 100644 --- a/torch/nn/backends/thnn.py +++ b/torch/nn/backends/thnn.py @@ -1,5 +1,6 @@ from .backend import FunctionBackend + class THNNFunctionBackend(FunctionBackend): def __reduce__(self): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index a46d932985..b4a16fd450 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -289,7 +289,7 @@ def max_unpool3d(input, indices, kernel_size, stride=None, padding=0, def lp_pool2d(input, norm_type, kernel_size, stride=None, ceil_mode=False): kw, kh = utils._pair(kernel_size) out = avg_pool2d(input.pow(norm_type), kernel_size, stride, 0, ceil_mode) - return out.mul(kw * kh).pow(1./norm_type) + return out.mul(kw * kh).pow(1. / norm_type) # Activation functions @@ -326,7 +326,7 @@ def prelu(input, weight): return _functions.thnn.PReLU()(input, weight) -def rrelu(input, lower=1./8, upper=1./3, training=False, inplace=False): +def rrelu(input, lower=1. / 8, upper=1. 
/ 3, training=False, inplace=False): return _functions.thnn.RReLU(lower, upper, training, inplace)(input) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 2703c03c34..059706a9d4 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -6,9 +6,9 @@ from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ Softmax, Softmax2d, LogSoftmax, ELU, Hardshrink, LeakyReLU, LogSigmoid, \ Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, NLLLoss2d, \ - CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ - MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ - SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss + CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ + SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss from .container import Container, Sequential from .pooling import AvgPool1d, AvgPool2d, AvgPool3d, MaxPool1d, MaxPool2d, MaxPool3d, \ MaxUnpool1d, MaxUnpool2d, MaxUnpool3d, FractionalMaxPool2d, LPPool2d @@ -18,6 +18,6 @@ from .padding import ReflectionPad2d, ReplicationPad2d, ReplicationPad3d from .normalization import CrossMapLRN2d from .sparse import Embedding from .rnn import RNNBase, RNN, LSTM, GRU, \ - RNNCell, LSTMCell, GRUCell + RNNCell, LSTMCell, GRUCell from .pixelshuffle import PixelShuffle from .upsampling import UpsamplingNearest2d, UpsamplingBillinear2d diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 4c90bfd9ec..be3f17f7f8 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -9,7 +9,7 @@ class Threshold(Module): """Thresholds each element of the input Tensor Threshold is defined as:: - + y = x if x >= threshold value if x < threshold @@ -29,6 +29,7 @@ class Threshold(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, threshold, value, inplace=False): super(Threshold, self).__init__() self.threshold = threshold @@ -40,7 +41,7 @@ class Threshold(Module): return F.threshold(input, self.threshold, self.value, self.inplace) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + str(self.threshold) \ + ', ' + str(self.value) \ @@ -64,17 +65,19 @@ class ReLU(Threshold): >>> print(input) >>> print(m(input)) """ + def __init__(self, inplace=False): super(ReLU, self).__init__(0, 0, inplace) def __repr__(self): - inplace_str='inplace' if self.inplace else '' + inplace_str = 'inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + inplace_str + ')' class RReLU(Module): - def __init__(self, lower=1./8, upper=1./3, inplace=False): + + def __init__(self, lower=1. / 8, upper=1. 
/ 3, inplace=False): super(RReLU, self).__init__() self.lower = lower self.upper = upper @@ -84,7 +87,7 @@ class RReLU(Module): return F.rrelu(input, self.lower, self.upper, self.training, self.inplace) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + str(self.lower) \ + ', ' + str(self.upper) \ @@ -95,7 +98,7 @@ class Hardtanh(Module): """Applies the HardTanh function element-wise HardTanh is defined as:: - + f(x) = +1, if x > 1 f(x) = -1, if x < -1 f(x) = x, otherwise @@ -118,6 +121,7 @@ class Hardtanh(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, min_value=-1, max_value=1, inplace=False): super(Hardtanh, self).__init__() self.min_val = min_value @@ -129,12 +133,13 @@ class Hardtanh(Module): return F.hardtanh(input, self.min_val, self.max_val, self.inplace) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + 'min_val=' + str(self.min_val) \ + ', max_val=' + str(self.max_val) \ + inplace_str + ')' + class ReLU6(Hardtanh): """Applies the element-wise function :math:`{ReLU6}(x) = min(max(0,x), 6)` @@ -152,14 +157,16 @@ class ReLU6(Hardtanh): >>> print(input) >>> print(m(input)) """ + def __init__(self, inplace=False): super(ReLU6, self).__init__(0, 6, inplace) def __repr__(self): - inplace_str='inplace' if self.inplace else '' + inplace_str = 'inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + inplace_str + ')' + class Sigmoid(Module): """Applies the element-wise function :math:`f(x) = 1 / ( 1 + exp(-x))` @@ -174,6 +181,7 @@ class Sigmoid(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return torch.sigmoid(input) @@ -181,7 +189,6 @@ class Sigmoid(Module): return self.__class__.__name__ + ' ()' - class Tanh(Module): """Applies element-wise, :math:`f(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))` @@ -196,12 +203,14 @@ class Tanh(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return torch.tanh(input) def __repr__(self): return self.__class__.__name__ + ' ()' + class ELU(Module): """Applies element-wise, :math:`f(x) = max(0,x) + min(0, alpha * (exp(x) - 1))` @@ -220,6 +229,7 @@ class ELU(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, alpha=1., inplace=False): super(ELU, self).__init__() self.alpha = alpha @@ -229,7 +239,7 @@ class ELU(Module): return F.elu(input, self.alpha, self.inplace) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + 'alpha=' + str(self.alpha) \ + inplace_str + ')' @@ -256,6 +266,7 @@ class Hardshrink(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, lambd=0.5): super(Hardshrink, self).__init__() self.lambd = lambd @@ -286,6 +297,7 @@ class LeakyReLU(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, negative_slope=1e-2, inplace=False): super(LeakyReLU, self).__init__() self.negative_slope = negative_slope @@ -295,11 +307,12 @@ class LeakyReLU(Module): return F.leaky_relu(input, self.negative_slope, self.inplace) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + str(self.negative_slope) \ + inplace_str + ')' + class LogSigmoid(Module): """Applies 
element-wise :math:`LogSigmoid(x) = log( 1 / (1 + exp(-x_i)))` @@ -314,12 +327,14 @@ class LogSigmoid(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return F.logsigmoid(input) def __repr__(self): return self.__class__.__name__ + ' ()' + class Softplus(Module): """Applies element-wise :math:`f(x) = 1/beta * log(1 + exp(beta * x_i))` @@ -344,6 +359,7 @@ class Softplus(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, beta=1, threshold=20): super(Softplus, self).__init__() self.beta = beta @@ -357,6 +373,7 @@ class Softplus(Module): + 'beta=' + str(self.beta) \ + ', threshold=' + str(self.threshold) + ')' + class Softshrink(Module): """Applies the soft shrinkage function elementwise @@ -379,6 +396,7 @@ class Softshrink(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, lambd=0.5): super(Softshrink, self).__init__() self.lambd = lambd @@ -398,7 +416,7 @@ class PReLU(Module): across all input channels. If called with nn.PReLU(nChannels), a separate "a" is used for each input channel. - + .. note:: weight decay should not be used when learning "a" for good performance. @@ -417,6 +435,7 @@ class PReLU(Module): >>> print(input) >>> print(m(input)) """ + def __init__(self, num_parameters=1, init=0.25): self.num_parameters = num_parameters super(PReLU, self).__init__() @@ -444,6 +463,7 @@ class Softsign(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return F.softsign(input) @@ -465,19 +485,21 @@ class Tanhshrink(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return F.tanhshrink(input) def __repr__(self): return self.__class__.__name__ + ' ()' + class Softmin(Module): """Applies the Softmin function to an n-dimensional input Tensor rescaling them so that the elements of the n-dimensional output Tensor lie in the range `(0, 1)` and sum to 1 :math:`f(x) = exp(-x_i - {shift}) / sum_j exp(-x_j - {shift})` - + where :math:`{shift} = max_i - x_i` Shape: @@ -487,7 +509,7 @@ class Softmin(Module): Returns: a Tensor of the same dimension and shape as the input, with values in the range [0, 1] - + Examples:: >>> m = nn.Softmin() @@ -495,12 +517,14 @@ class Softmin(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return F.softmin(input) def __repr__(self): return self.__class__.__name__ + ' ()' + class Softmax(Module): """Applies the Softmax function to an n-dimensional input Tensor rescaling them so that the elements of the n-dimensional output Tensor @@ -521,7 +545,7 @@ class Softmax(Module): This module doesn't work directly with NLLLoss, which expects the Log to be computed between the Softmax and itself. Use Logsoftmax instead (it's faster). 
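As an aside on the note just above about NLLLoss and LogSoftmax: computing log(softmax(x)) in one fused pass is both faster and numerically safer than chaining the two modules, because the max-shift and the log cancel most of the exponentiation. A pure-Python sketch (illustrative only, not the library's implementation)::

    import math

    def log_softmax(xs):
        # log(softmax(x)) computed directly as x_i - shift - log(sum_j exp(x_j - shift)).
        # The shift (the max) only serves numerical stability and cancels out exactly.
        shift = max(xs)
        log_z = math.log(sum(math.exp(x - shift) for x in xs))
        return [x - shift - log_z for x in xs]

    print(log_softmax([1.0, 2.0, 3.0]))   # ~[-2.408, -1.408, -0.408]; exp() of these sums to 1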
- + Examples:: >>> m = nn.Softmax() @@ -529,6 +553,7 @@ class Softmax(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): assert input.dim() == 2, 'Softmax requires a 2D tensor as input' return F.softmax(input) @@ -536,6 +561,7 @@ class Softmax(Module): def __repr__(self): return self.__class__.__name__ + ' ()' + class Softmax2d(Module): """Applies SoftMax over features to each spatial location @@ -550,7 +576,7 @@ class Softmax2d(Module): Returns: a Tensor of the same dimension and shape as the input with values in the range [0, 1] - + Examples:: >>> m = nn.Softmax2d() @@ -559,6 +585,7 @@ class Softmax2d(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): assert input.dim() == 4, 'Softmax2d requires a 4D tensor as input' return F.softmax(input) @@ -566,10 +593,11 @@ class Softmax2d(Module): def __repr__(self): return self.__class__.__name__ + ' ()' + class LogSoftmax(Module): """Applies the Log(Softmax(x)) function to an n-dimensional input Tensor. The LogSoftmax formulation can be simplified as - + :math:`f_i(x) = log(1 / a * exp(x_i))` where :math:`a = sum_j exp(x_j)` Shape: @@ -587,9 +615,9 @@ class LogSoftmax(Module): >>> print(input) >>> print(m(input)) """ + def forward(self, input): return F.log_softmax(input) def __repr__(self): return self.__class__.__name__ + ' ()' - diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index 851489082b..268d8263d9 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -61,7 +61,7 @@ class BatchNorm1d(_BatchNorm): During training, this layer keeps a running estimate of its computed mean and variance. The running sum is kept with a default momentum of 0.1. - + During evaluation, this running mean/variance is used for normalization. Args: @@ -82,6 +82,7 @@ class BatchNorm1d(_BatchNorm): >>> input = autograd.Variable(torch.randn(20, 100)) >>> output = m(input) """ + def _check_input_dim(self, input): if input.dim() != 2 and input.dim() != 3: raise ValueError('expected 2D or 3D input (got {}D input)' @@ -102,7 +103,7 @@ class BatchNorm2d(_BatchNorm): During training, this layer keeps a running estimate of its computed mean and variance. The running sum is kept with a default momentum of 0.1. - + During evaluation, this running mean/variance is used for normalization. Args: @@ -123,6 +124,7 @@ class BatchNorm2d(_BatchNorm): >>> input = autograd.Variable(torch.randn(20, 100, 35, 45)) >>> output = m(input) """ + def _check_input_dim(self, input): if input.dim() != 4: raise ValueError('expected 4D input (got {}D input)' @@ -143,7 +145,7 @@ class BatchNorm3d(_BatchNorm): During training, this layer keeps a running estimate of its computed mean and variance. The running sum is kept with a default momentum of 0.1. - + During evaluation, this running mean/variance is used for normalization. 
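To make the running-statistics behaviour repeated in these BatchNorm docstrings concrete: during training the layer blends each batch's statistics into its stored estimates, and at evaluation time those stored estimates replace the batch statistics. A hedged pure-Python sketch of one common convention (helper names are illustrative, not the module's API)::

    import math

    def update_running_stats(running_mean, running_var, batch_mean, batch_var, momentum=0.1):
        # With momentum = 0.1 each new batch contributes 10% and the old estimate keeps 90%.
        new_mean = (1.0 - momentum) * running_mean + momentum * batch_mean
        new_var = (1.0 - momentum) * running_var + momentum * batch_var
        return new_mean, new_var

    def batchnorm_eval(x, running_mean, running_var, gamma=1.0, beta=0.0, eps=1e-5):
        # Evaluation-time normalisation uses the stored running estimates.
        return gamma * (x - running_mean) / math.sqrt(running_var + eps) + beta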
Args: @@ -164,6 +166,7 @@ class BatchNorm3d(_BatchNorm): >>> input = autograd.Variable(torch.randn(20, 100, 35, 45, 10)) >>> output = m(input) """ + def _check_input_dim(self, input): if input.dim() != 5: raise ValueError('expected 5D input (got {}D input)' diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index 64a80bfa7c..5f2c4cc3a4 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -4,13 +4,14 @@ import torch import warnings from .module import Module + class Container(Module): def __init__(self, **kwargs): super(Container, self).__init__() # DeprecationWarning is ignored by default <sigh> warnings.warn("nn.Container is deprecated. All of it's functionality " - "is now implemented in nn.Module. Subclass that instead.") + "is now implemented in nn.Module. Subclass that instead.") for key, value in kwargs.items(): self.add_module(key, value) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 3cf8fe910b..8892ec77d1 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -7,6 +7,7 @@ from .utils import _single, _pair, _triple class _ConvNd(Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias): super(_ConvNd, self).__init__() @@ -328,6 +329,7 @@ class Conv3d(_ConvNd): class _ConvTransposeMixin(object): + def forward(self, input, output_size=None): output_padding = self._output_padding(input, output_size) func = self._backend.ConvNd( @@ -400,6 +402,7 @@ class ConvTranspose1d(_ConvTransposeMixin, _ConvNd): weight (Tensor): the learnable weights of the module of shape (in_channels, out_channels, kernel_size[0], kernel_size[1]) bias (Tensor): the learnable bias of the module of shape (out_channels) """ + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, groups=1, bias=True): kernel_size = _single(kernel_size) diff --git a/torch/nn/modules/dropout.py b/torch/nn/modules/dropout.py index 904c1da5a9..f24e7e47ae 100644 --- a/torch/nn/modules/dropout.py +++ b/torch/nn/modules/dropout.py @@ -1,6 +1,7 @@ from .module import Module from .. import functional as F + class Dropout(Module): r"""Randomly zeroes some of the elements of the input tensor. The elements to zero are randomized on every forward call. @@ -19,11 +20,12 @@ class Dropout(Module): >>> input = autograd.Variable(torch.randn(20, 16)) >>> output = m(input) """ + def __init__(self, p=0.5, inplace=False): super(Dropout, self).__init__() if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) + "but got {}".format(p)) self.p = p self.inplace = inplace @@ -70,11 +72,12 @@ class Dropout2d(Module): .. _Efficient Object Localization Using Convolutional Networks: http://arxiv.org/abs/1411.4280 """ + def __init__(self, p=0.5, inplace=False): super(Dropout2d, self).__init__() if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) + "but got {}".format(p)) self.p = p self.inplace = inplace @@ -82,11 +85,12 @@ class Dropout2d(Module): return self._backend.Dropout2d(self.p, self.training, self.inplace)(input) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + 'p=' + str(self.p) \ + inplace_str + ')' + class Dropout3d(Module): r"""Randomly zeroes whole channels of the input tensor. 
The channels to zero are randomized on every forward call. @@ -120,11 +124,12 @@ class Dropout3d(Module): .. _Efficient Object Localization Using Convolutional Networks: http://arxiv.org/abs/1411.4280 """ + def __init__(self, p=0.5, inplace=False): super(Dropout3d, self).__init__() if p < 0 or p > 1: raise ValueError("dropout probability has to be between 0 and 1, " - "but got {}".format(p)) + "but got {}".format(p)) self.p = p self.inplace = inplace @@ -132,8 +137,7 @@ class Dropout3d(Module): return self._backend.Dropout3d(self.p, self.training, self.inplace)(input) def __repr__(self): - inplace_str=', inplace' if self.inplace else '' + inplace_str = ', inplace' if self.inplace else '' return self.__class__.__name__ + ' (' \ + 'p=' + str(self.p) \ + inplace_str + ')' - diff --git a/torch/nn/modules/linear.py b/torch/nn/modules/linear.py index e07be24b60..63aa420839 100644 --- a/torch/nn/modules/linear.py +++ b/torch/nn/modules/linear.py @@ -29,6 +29,7 @@ class Linear(Module): >>> output = m(input) >>> print(output.size()) """ + def __init__(self, in_features, out_features, bias=True): super(Linear, self).__init__() self.in_features = in_features @@ -41,7 +42,7 @@ class Linear(Module): self.reset_parameters() def reset_parameters(self): - stdv = 1./math.sqrt(self.weight.size(1)) + stdv = 1. / math.sqrt(self.weight.size(1)) self.weight.data.uniform_(-stdv, stdv) if self.bias is not None: self.bias.data.uniform_(-stdv, stdv) diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index fb2e581ab9..c676866b80 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -5,6 +5,7 @@ from .container import Sequential from .activation import LogSoftmax from .. import functional as F + def _assert_no_grad(variable): assert not variable.requires_grad, \ "nn criterions don't compute the gradient w.r.t. targets - please " \ @@ -52,7 +53,7 @@ class L1Loss(_Loss): class NLLLoss(_WeighedLoss): r"""The negative log likelihood loss. It is useful to train a classication problem with n classes - + If provided, the optional argument `weights` should be a 1D Tensor assigning weight to each of the classes. @@ -65,7 +66,7 @@ class NLLLoss(_WeighedLoss): adding a `LogSoftmax` layer in the last layer of your network. You may use `CrossEntropyLoss` instead, if you prefer not to add an extra layer. - + The target that this loss expects is a class index `(0 to N-1, where N = number of classes)` The loss can be described as:: @@ -201,14 +202,14 @@ class HingeEmbeddingLoss(_Loss): This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance, and is typically used for learning nonlinear embeddings or semi-supervised learning:: - + { x_i, if y_i == 1 loss(x, y) = 1/n { { max(0, margin - x_i), if y_i == -1 - + `x` and `y` arbitrary shapes with a total of `n` elements each the sum operation still operates over all the elements, and divides by `n`. - + The division by `n` can be avoided if one sets the internal variable `sizeAverage=False`. The `margin` has a default value of `1`, or can be set in the constructor. @@ -221,9 +222,9 @@ class MultiLabelMarginLoss(_Loss): hinge loss (margin-based loss) between input `x` (a 2D mini-batch `Tensor`) and output `y` (which is a 2D `Tensor` of target class indices). For each sample in the mini-batch:: - + loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0) - + where `i == 0` to `x.size(0)`, `j == 0` to `y.size(0)`, `y[j] != 0`, and `i != y[j]` for all `i` and `j`. 
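The HingeEmbeddingLoss formula quoted in one of the hunks above is compact enough to transcribe directly; a pure-Python sketch with plain lists standing in for tensors (illustrative only)::

    def hinge_embedding_loss(xs, ys, margin=1.0, size_average=True):
        # x_i when y_i == 1, max(0, margin - x_i) when y_i == -1, as in the docstring.
        terms = [x if y == 1 else max(0.0, margin - x) for x, y in zip(xs, ys)]
        total = sum(terms)
        return total / len(terms) if size_average else total

    print(hinge_embedding_loss([0.2, 2.5], [1, -1]))   # (0.2 + 0.0) / 2 = 0.1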
@@ -242,11 +243,11 @@ class SmoothL1Loss(_Loss): It is less sensitive to outliers than the `MSELoss` and in some cases prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick). Also known as the Huber loss:: - + { 0.5 * (x_i - y_i)^2, if |x_i - y_i| < 1 loss(x, y) = 1/n \sum { { |x_i - y_i| - 0.5, otherwise - + `x` and `y` arbitrary shapes with a total of `n` elements each the sum operation still operates over all the elements, and divides by `n`. @@ -260,11 +261,11 @@ class SoftMarginLoss(_Loss): r"""Creates a criterion that optimizes a two-class classification logistic loss between input `x` (a 2D mini-batch Tensor) and target `y` (which is a tensor containing either `1` or `-1`). - + :: - + loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x.nelement() - + The normalization by the number of elements in the input can be disabled by setting `self.sizeAverage` to `False`. """ @@ -287,7 +288,7 @@ class CrossEntropyLoss(_WeighedLoss): `target` for each value of a 1D tensor of size `n` The loss can be described as:: - + loss(x, class) = -log(exp(x[class]) / (\sum_j exp(x[j]))) = -x[class] + log(\sum_j exp(x[j])) @@ -302,25 +303,28 @@ class CrossEntropyLoss(_WeighedLoss): - Target: :math:`(N)` where each value is `0 <= targets[i] <= C-1` """ + def forward(self, input, target): _assert_no_grad(target) return F.cross_entropy(input, target, - self.weight, self.size_average) + self.weight, self.size_average) + class MultiLabelSoftMarginLoss(_WeighedLoss): r"""Creates a criterion that optimizes a multi-label one-versus-all loss based on max-entropy, between input `x` (a 2D mini-batch `Tensor`) and target `y` (a binary 2D `Tensor`). For each sample in the minibatch:: - + loss(x, y) = - sum_i (y[i] log( exp(x[i]) / (1 + exp(x[i]))) + (1-y[i]) log(1/(1+exp(x[i])))) / x:nElement() - + where `i == 0` to `x.nElement()-1`, `y[i] in {0,1}`. `y` and `x` must have the same size. """ + def forward(self, input, target): return F.binary_cross_entropy(torch.sigmoid(input), target, - self.weight, self.size_average) + self.weight, self.size_average) class CosineEmbeddingLoss(Module): @@ -334,16 +338,17 @@ class CosineEmbeddingLoss(Module): If `margin` is missing, the default value is `0`. The loss function for each sample is:: - + { 1 - cos(x1, x2), if y == 1 loss(x, y) = { { max(0, cos(x1, x2) - margin), if y == -1 - + If the internal variable `sizeAverage` is equal to `True`, the loss function averages the loss over the batch samples; if `sizeAverage` is `False`, then the loss function sums over the batch samples. By default, `sizeAverage = True`. """ + def __init__(self, margin=0, size_average=True): super(CosineEmbeddingLoss, self).__init__() self.margin = margin @@ -351,7 +356,7 @@ class CosineEmbeddingLoss(Module): def forward(self, input1, input2, target): return self._backend.CosineEmbeddingLoss(self.margin, - self.size_average)(input1, input2, target) + self.size_average)(input1, input2, target) class MarginRankingLoss(Module): @@ -363,14 +368,15 @@ class MarginRankingLoss(Module): (have a larger value) than the second input, and vice-versa for `y == -1`. The loss function for each sample in the mini-batch is:: - + loss(x, y) = max(0, -y * (x1 - x2) + margin) - + if the internal variable `sizeAverage = True`, the loss function averages the loss over the batch samples; if `sizeAverage = False`, then the loss function sums over the batch samples. By default, `sizeAverage` equals to `True`. 
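The MarginRankingLoss formula at the end of the hunk above is likewise easy to spell out; a minimal pure-Python transcription (illustrative names only)::

    def margin_ranking_loss(x1s, x2s, ys, margin=0.0, size_average=True):
        # max(0, -y * (x1 - x2) + margin) per sample, averaged (or summed) over the batch.
        terms = [max(0.0, -y * (a - b) + margin) for a, b, y in zip(x1s, x2s, ys)]
        total = sum(terms)
        return total / len(terms) if size_average else total

    # y == 1 asks for x1 to be ranked above x2; the second pair violates that and is penalised.
    print(margin_ranking_loss([0.9, 0.1], [0.2, 0.8], [1, 1]))   # (0.0 + 0.7) / 2 = 0.35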
""" + def __init__(self, margin=0, size_average=True): super(MarginRankingLoss, self).__init__() self.margin = margin @@ -378,7 +384,7 @@ class MarginRankingLoss(Module): def forward(self, input1, input2, target): return self._backend.MarginRankingLoss(self.margin, - self.size_average)(input1, input2, target) + self.size_average)(input1, input2, target) class MultiMarginLoss(Module): @@ -401,6 +407,7 @@ class MultiMarginLoss(Module): However, if the field `sizeAverage` is set to `False`, the losses are instead summed. """ + def __init__(self, p=1, margin=1, weight=None, size_average=True): super(MultiMarginLoss, self).__init__() if p != 1 and p != 2: @@ -413,7 +420,7 @@ class MultiMarginLoss(Module): def forward(self, input, target): return self._backend.MultiMarginLoss(self.size_average, self.p, - self.margin, weight=self.weight)(input, target) + self.margin, weight=self.weight)(input, target) # TODO: L1HingeEmbeddingCriterion diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index ef085ad80c..737fd625ab 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -60,7 +60,7 @@ class Module(object): if not isinstance(param, Parameter): if isinstance(param, Variable): raise TypeError("can't use a Variable as a module " - "parameter. Convert it to torch.nn.Parameter first.") + "parameter. Convert it to torch.nn.Parameter first.") if param is not None: param = Parameter(param) self._parameters[name] = param @@ -398,6 +398,6 @@ class Module(object): for key, module in self._modules.items(): modstr = module.__repr__() modstr = _addindent(modstr, 2) - tmpstr = tmpstr + ' (' + key + '): ' + modstr + '\n' + tmpstr = tmpstr + ' (' + key + '): ' + modstr + '\n' tmpstr = tmpstr + ')' return tmpstr diff --git a/torch/nn/modules/normalization.py b/torch/nn/modules/normalization.py index 74e29ce599..e2bb96c1c6 100644 --- a/torch/nn/modules/normalization.py +++ b/torch/nn/modules/normalization.py @@ -2,6 +2,7 @@ from .module import Module class CrossMapLRN2d(Module): + def __init__(self, size, alpha=1e-4, beta=0.75, k=1): super(CrossMapLRN2d, self).__init__() self.size = size diff --git a/torch/nn/modules/padding.py b/torch/nn/modules/padding.py index 31127481a4..e02c3fd05f 100644 --- a/torch/nn/modules/padding.py +++ b/torch/nn/modules/padding.py @@ -2,6 +2,8 @@ from .module import Module from .utils import _quadruple, _ntuple # TODO: grad_output size asserts in THNN + + class ReflectionPad2d(Module): def __init__(self, padding): @@ -14,6 +16,7 @@ class ReflectionPad2d(Module): def __repr__(self): return self.__class__.__name__ + ' ' + str(self.padding) + class ReplicationPad2d(Module): def __init__(self, padding): @@ -26,6 +29,7 @@ class ReplicationPad2d(Module): def __repr__(self): return self.__class__.__name__ + ' ' + str(self.padding) + class ReplicationPad3d(Module): def __init__(self, padding): @@ -39,4 +43,3 @@ class ReplicationPad3d(Module): return self.__class__.__name__ + ' ' + str(self.padding) # TODO: ZeroPad2d - diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py index 67f7ac13ab..cfc6df2e5b 100644 --- a/torch/nn/modules/pixelshuffle.py +++ b/torch/nn/modules/pixelshuffle.py @@ -5,7 +5,7 @@ from .. import functional as F class PixelShuffle(Module): r"""Rearranges elements in a Tensor of shape :math:`(*, C * r^2, H, W]` to a tensor of shape :math:`(C, H * r, W * r)`. - + This is useful for implementing efficient sub-pixel convolution with a stride of :math:`1/r`. 
diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index aae6ab0989..26fbba916c 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -48,6 +48,7 @@ class MaxPool1d(Module): .. _link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md """ + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): super(MaxPool1d, self).__init__() @@ -60,8 +61,8 @@ class MaxPool1d(Module): def forward(self, input): return F.max_pool1d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices) + self.padding, self.dilation, self.ceil_mode, + self.return_indices) def __repr__(self): return self.__class__.__name__ + ' (' \ @@ -71,6 +72,7 @@ class MaxPool1d(Module): + ', dilation=' + str(self.dilation) \ + ', ceil_mode=' + str(self.ceil_mode) + ')' + class MaxPool2d(Module): r"""Applies a 2D max pooling over an input signal composed of several input planes. @@ -124,6 +126,7 @@ class MaxPool2d(Module): .. _link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md """ + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False): super(MaxPool2d, self).__init__() @@ -136,17 +139,17 @@ class MaxPool2d(Module): def forward(self, input): return F.max_pool2d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices) + self.padding, self.dilation, self.ceil_mode, + self.return_indices) def __repr__(self): kh, kw = _pair(self.kernel_size) dh, dw = _pair(self.stride) padh, padw = _pair(self.padding) dilh, dilw = _pair(self.dilation) - padding_str=', padding=(' + str(padh) + ', ' + str(padw) + ')' \ - if padh != 0 and padw !=0 else '' - dilation_str=(', dilation=(' + str(dilh) + ', ' + str(dilw) + ')' \ + padding_str = ', padding=(' + str(padh) + ', ' + str(padw) + ')' \ + if padh != 0 and padw != 0 else '' + dilation_str = (', dilation=(' + str(dilh) + ', ' + str(dilw) + ')' if dilh != 0 and dilw != 0 else '') return self.__class__.__name__ + ' (' \ + 'size=(' + str(kh) + ', ' + str(kw) + ')' \ @@ -185,6 +188,7 @@ class MaxUnpool1d(Module): 0 2 0 4 0 6 0 8 [torch.FloatTensor of size 1x1x8] """ + def __init__(self, kernel_size, stride=None, padding=0): super(MaxUnpool1d, self).__init__() self.kernel_size = _single(kernel_size) @@ -234,6 +238,7 @@ class MaxUnpool2d(Module): 0 14 0 16 [torch.FloatTensor of size 1x1x4x4] """ + def __init__(self, kernel_size, stride=None, padding=0): super(MaxUnpool2d, self).__init__() self.kernel_size = _pair(kernel_size) @@ -276,6 +281,7 @@ class MaxUnpool3d(Module): >>> unpooled_output.size() torch.Size([20, 16, 51, 33, 15]) """ + def __init__(self, kernel_size, stride=None, padding=0): super(MaxUnpool3d, self).__init__() self.kernel_size = _triple(kernel_size) @@ -330,6 +336,7 @@ class AvgPool1d(Module): 2 4 6 [torch.FloatTensor of size 1x1x3] """ + def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True): super(AvgPool1d, self).__init__() @@ -391,8 +398,9 @@ class AvgPool2d(Module): >>> input = autograd.Variable(torch.randn(20, 16, 50, 32)) >>> output = m(input) """ + def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, - count_include_pad=True): + count_include_pad=True): super(AvgPool2d, self).__init__() self.kernel_size = kernel_size self.stride = stride or kernel_size @@ -402,7 +410,7 @@ class AvgPool2d(Module): def forward(self, input): return 
F.avg_pool2d(input, self.kernel_size, self.stride, - self.padding, self.ceil_mode, self.count_include_pad) + self.padding, self.ceil_mode, self.count_include_pad) class MaxPool3d(Module): @@ -459,8 +467,9 @@ class MaxPool3d(Module): .. _link: https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md """ + def __init__(self, kernel_size, stride=None, padding=0, dilation=1, - return_indices=False, ceil_mode=False): + return_indices=False, ceil_mode=False): super(MaxPool3d, self).__init__() self.kernel_size = kernel_size self.stride = stride or kernel_size @@ -471,8 +480,9 @@ class MaxPool3d(Module): def forward(self, input): return F.max_pool3d(input, self.kernel_size, self.stride, - self.padding, self.dilation, self.ceil_mode, - self.return_indices) + self.padding, self.dilation, self.ceil_mode, + self.return_indices) + class AvgPool3d(Module): r"""Applies a 3D average pooling over an input signal composed of several input @@ -515,6 +525,7 @@ class AvgPool3d(Module): >>> input = autograd.Variable(torch.randn(20, 16, 50,44, 31)) >>> output = m(input) """ + def __init__(self, kernel_size, stride=None): super(AvgPool3d, self).__init__() self.kernel_size = kernel_size @@ -554,8 +565,9 @@ class FractionalMaxPool2d(Module): .. _Fractional MaxPooling: http://arxiv.org/abs/1412.6071 """ + def __init__(self, kernel_size, output_size=None, output_ratio=None, - return_indices=False, _random_samples=None): + return_indices=False, _random_samples=None): super(FractionalMaxPool2d, self).__init__() self.kh, self.kw = _pair(kernel_size) self.return_indices = return_indices @@ -572,7 +584,7 @@ class FractionalMaxPool2d(Module): assert 0 < self.rw < 1 else: raise ValueError("FractionalMaxPool2d requires specifying either " - "an output size, or a pooling ratio") + "an output size, or a pooling ratio") def forward(self, input): kwargs = {} @@ -581,8 +593,8 @@ class FractionalMaxPool2d(Module): else: kwargs['output_ratio'] = self.rh, self.rw func = self._backend.FractionalMaxPool2d(self.kw, self.kh, - return_indices=self.return_indices, - _random_samples=self._random_samples, **kwargs) + return_indices=self.return_indices, + _random_samples=self._random_samples, **kwargs) return func(input) @@ -632,7 +644,7 @@ class LPPool2d(Module): def forward(self, input): return F.lp_pool2d(input, self.norm_type, self.kernel_size, - self.stride, self.ceil_mode) + self.stride, self.ceil_mode) # TODO: AdaptiveMaxPool2d diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py index b67edb49db..78ddb0c6f0 100644 --- a/torch/nn/modules/rnn.py +++ b/torch/nn/modules/rnn.py @@ -206,6 +206,7 @@ class LSTM(RNNBase): >>> c0 = Variable(torch.randn(2, 3, 20)) >>> output, hn = rnn(input, (h0, c0)) """ + def __init__(self, *args, **kwargs): super(LSTM, self).__init__('LSTM', *args, **kwargs) @@ -403,11 +404,11 @@ class LSTMCell(RNNCellBase): self.input_size = input_size self.hidden_size = hidden_size self.bias = bias - self.weight_ih = Parameter(torch.Tensor(4*hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(4*hidden_size, hidden_size)) + self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) + self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) if bias: - self.bias_ih = Parameter(torch.Tensor(4*hidden_size)) - self.bias_hh = Parameter(torch.Tensor(4*hidden_size)) + self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) + self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) else: self.register_parameter('bias_ih', None) self.register_parameter('bias_hh', None) @@ 
-428,7 +429,7 @@ class LSTMCell(RNNCellBase): class GRUCell(RNNCellBase): r"""A gated recurrent unit (GRU) cell - + .. math:: \begin{array}{ll} @@ -472,11 +473,11 @@ class GRUCell(RNNCellBase): self.input_size = input_size self.hidden_size = hidden_size self.bias = bias - self.weight_ih = Parameter(torch.Tensor(3*hidden_size, input_size)) - self.weight_hh = Parameter(torch.Tensor(3*hidden_size, hidden_size)) + self.weight_ih = Parameter(torch.Tensor(3 * hidden_size, input_size)) + self.weight_hh = Parameter(torch.Tensor(3 * hidden_size, hidden_size)) if bias: - self.bias_ih = Parameter(torch.Tensor(3*hidden_size)) - self.bias_hh = Parameter(torch.Tensor(3*hidden_size)) + self.bias_ih = Parameter(torch.Tensor(3 * hidden_size)) + self.bias_hh = Parameter(torch.Tensor(3 * hidden_size)) else: self.register_parameter('bias_ih', None) self.register_parameter('bias_hh', None) diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py index f56ced86c3..6af3c90616 100644 --- a/torch/nn/modules/sparse.py +++ b/torch/nn/modules/sparse.py @@ -62,6 +62,7 @@ class Embedding(Module): [torch.FloatTensor of size 1x4x3] """ + def __init__(self, num_embeddings, embedding_dim, padding_idx=None, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False): @@ -89,7 +90,7 @@ class Embedding(Module): return self._backend.Embedding( padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse - )(input, self.weight) + )(input, self.weight) def __repr__(self): s = '{name}({num_embeddings}, {embedding_dim}' diff --git a/torch/nn/modules/upsampling.py b/torch/nn/modules/upsampling.py index 95bb0c8857..96ec32fd9e 100644 --- a/torch/nn/modules/upsampling.py +++ b/torch/nn/modules/upsampling.py @@ -6,6 +6,7 @@ from .utils import _pair class _UpsamplingBase(Module): + def __init__(self, size=None, scale_factor=None): super(_UpsamplingBase, self).__init__() if size is None and scale_factor is None: @@ -17,10 +18,12 @@ class _UpsamplingBase(Module): class UpsamplingNearest2d(_UpsamplingBase): + def forward(self, input): return F.upsample_nearest(input, self.size, self.scale_factor) class UpsamplingBillinear2d(_UpsamplingBase): + def forward(self, input): return F.upsample_billinear(input, self.size, self.scale_factor) diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index feb6c56d98..c3dc5126e6 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -1,6 +1,7 @@ import collections from itertools import repeat + def _ntuple(n): def parse(x): if isinstance(x, collections.Iterable): diff --git a/torch/nn/parallel/_functions.py b/torch/nn/parallel/_functions.py index 5c40af2b81..c815c08eae 100644 --- a/torch/nn/parallel/_functions.py +++ b/torch/nn/parallel/_functions.py @@ -33,7 +33,7 @@ class Gather(Function): def backward(self, grad_output): return comm.scatter(grad_output, self.input_gpus, self.input_sizes, - self.dim) + self.dim) class Scatter(Function): diff --git a/torch/nn/parallel/parallel_apply.py b/torch/nn/parallel/parallel_apply.py index fc601cef74..32df2b1ff5 100644 --- a/torch/nn/parallel/parallel_apply.py +++ b/torch/nn/parallel/parallel_apply.py @@ -32,7 +32,7 @@ def parallel_apply(modules, inputs): threads = [threading.Thread(target=_worker, args=(module, input, results, lock)) - for module, input in zip(modules, inputs)] + for module, input in zip(modules, inputs)] for thread in threads: thread.start() @@ -45,4 +45,3 @@ def parallel_apply(modules, inputs): raise output outputs.append(output) return outputs - diff --git 
a/torch/optim/adadelta.py b/torch/optim/adadelta.py index 0a555f2b70..f1d7b3f809 100644 --- a/torch/optim/adadelta.py +++ b/torch/optim/adadelta.py @@ -1,5 +1,6 @@ from .optimizer import Optimizer + class Adadelta(Optimizer): """Implements Adadelta algorithm. @@ -61,4 +62,3 @@ class Adadelta(Optimizer): acc_delta.mul_(rho).addcmul_(1 - rho, delta, delta) return loss - diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index d273f113a5..9d7448c4c1 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -1,5 +1,6 @@ from .optimizer import Optimizer + class Adagrad(Optimizer): """Implements Adagrad algorithm. @@ -53,4 +54,3 @@ class Adagrad(Optimizer): p.data.addcdiv_(-clr, grad, std) return loss - diff --git a/torch/optim/adam.py b/torch/optim/adam.py index 5b77e42c78..38756edc73 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -1,6 +1,7 @@ import math from .optimizer import Optimizer + class Adam(Optimizer): """Implements Adam algorithm. @@ -21,9 +22,9 @@ class Adam(Optimizer): """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0): + weight_decay=0): defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay) + weight_decay=weight_decay) super(Adam, self).__init__(params, defaults) def step(self, closure=None): @@ -71,4 +72,3 @@ class Adam(Optimizer): p.data.addcdiv_(-step_size, exp_avg, denom) return loss - diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py index d939f8729d..93997be24a 100644 --- a/torch/optim/adamax.py +++ b/torch/optim/adamax.py @@ -1,6 +1,7 @@ import torch from .optimizer import Optimizer + class Adamax(Optimizer): """Implements Adamax algorithm (a variant of Adam based on infinity norm). @@ -21,7 +22,7 @@ class Adamax(Optimizer): """ def __init__(self, params, lr=1e-2, betas=(0.9, 0.999), eps=1e-38, - weight_decay=0): + weight_decay=0): defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super(Adamax, self).__init__(params, defaults) @@ -71,5 +72,3 @@ class Adamax(Optimizer): p.data.addcdiv_(-clr, exp_avg, exp_inf) return loss - - diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py index 37fe336c8a..0abdd6a80f 100644 --- a/torch/optim/asgd.py +++ b/torch/optim/asgd.py @@ -22,7 +22,7 @@ class ASGD(Optimizer): def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0): defaults = dict(lr=lr, lambd=lambd, alpha=alpha, t0=t0, - weight_decay=weight_decay) + weight_decay=weight_decay) super(ASGD, self).__init__(params, defaults) def step(self, closure=None): @@ -67,8 +67,7 @@ class ASGD(Optimizer): # update eta and mu state['eta'] = (group['lr'] / - math.pow((1 + group['lambd'] * group['lr'] * state['step']), group['alpha'])) + math.pow((1 + group['lambd'] * group['lr'] * state['step']), group['alpha'])) state['mu'] = 1 / max(1, state['step'] - group['t0']) return loss - diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py index 6971d8edaf..36bd4d29e8 100644 --- a/torch/optim/lbfgs.py +++ b/torch/optim/lbfgs.py @@ -33,8 +33,8 @@ class LBFGS(Optimizer): """ def __init__(self, params, lr=1, max_iter=20, max_eval=None, - tolerance_grad=1e-5, tolerance_change=1e-9, history_size=100, - line_search_fn=None): + tolerance_grad=1e-5, tolerance_change=1e-9, history_size=100, + line_search_fn=None): if max_eval is None: max_eval = max_iter * 5 // 4 defaults = dict(lr=lr, max_iter=max_iter, max_eval=max_eval, @@ -44,7 +44,7 @@ class LBFGS(Optimizer): if len(self.param_groups) != 1: raise ValueError("LBFGS doesn't support per-parameter options " - 
"(parameter groups)") + "(parameter groups)") self._params = self.param_groups[0]['params'] self._numel_cache = None @@ -56,13 +56,13 @@ class LBFGS(Optimizer): def _gather_flat_grad(self): return torch.cat( - tuple(param.grad.data.view(-1) for param in self._params), 0) + tuple(param.grad.data.view(-1) for param in self._params), 0) def _add_grad(self, step_size, update): offset = 0 for p in self._params: numel = p.numel() - p.data.add_(step_size, update[offset:offset+numel]) + p.data.add_(step_size, update[offset:offset + numel]) offset += numel assert offset == self._numel() @@ -158,7 +158,7 @@ class LBFGS(Optimizer): # iteration in L-BFGS loop collapsed to use just one buffer q = flat_grad.neg() - for i in range(num_old-1, -1, -1): + for i in range(num_old - 1, -1, -1): al[i] = old_dirs[i].dot(q) * ro[i] q.add_(-al[i], old_stps[i]) diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 6eaf68e9d4..599143a6d9 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -21,8 +21,8 @@ class Optimizer(object): def __init__(self, params, defaults): if isinstance(params, Variable) or torch.is_tensor(params): raise TypeError("params argument given to the optimizer should be " - "an iterable of Variables or dicts, but got " + - torch.typename(params)) + "an iterable of Variables or dicts, but got " + + torch.typename(params)) self.state = defaultdict(dict) self.param_groups = list(params) @@ -37,15 +37,15 @@ class Optimizer(object): group_set = set(group['params']) if not param_set.isdisjoint(group_set): raise ValueError("some parameters appear in more than one " - "parameter group") + "parameter group") param_set.update(group_set) for name, default in defaults.items(): for i, group in enumerate(self.param_groups): if default is required and name not in group: raise ValueError("parameter group " + str(i) + " didn't " - "specify a value of required optimization parameter " - + name) + "specify a value of required optimization parameter " + + name) else: group.setdefault(name, default) @@ -53,10 +53,10 @@ class Optimizer(object): for param in group['params']: if not isinstance(param, Variable): raise TypeError("optimizer can only optimize Variables, " - "but one of the params is " + torch.typename(param)) + "but one of the params is " + torch.typename(param)) if not param.requires_grad: raise ValueError("optimizing a parameter that doesn't " - "require gradients") + "require gradients") if param.creator is not None: raise ValueError("can't optimize a non-leaf Variable") @@ -104,17 +104,17 @@ class Optimizer(object): if len(groups) != len(saved_groups): raise ValueError("loaded state dict has a different number of " - "parameter groups") + "parameter groups") param_lens = (len(g['params']) for g in groups) saved_lens = (len(g['params']) for g in saved_groups) if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)): raise ValueError("loaded state dict contains a parameter group " - "that doesn't match the size of optimizer's group") + "that doesn't match the size of optimizer's group") # Update the state id_map = {old_id: p for old_id, p in - zip(chain(*(g['params'] for g in saved_groups)), - chain(*(g['params'] for g in groups)))} + zip(chain(*(g['params'] for g in saved_groups)), + chain(*(g['params'] for g in groups)))} self.state = {id_map.get(k, k): v for k, v in state_dict['state'].items()} # Update parameter groups, setting their 'params' value diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index 1b1af293f7..00742d1e98 100644 --- 
a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -1,5 +1,6 @@ from .optimizer import Optimizer + class RMSprop(Optimizer): """Implements RMSprop algorithm. @@ -52,6 +53,3 @@ class RMSprop(Optimizer): p.data.addcdiv_(-group['lr'], grad, avg) return loss - - - diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index 98c0939f47..93403d1f78 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -1,6 +1,7 @@ import math from .optimizer import Optimizer + class Rprop(Optimizer): """Implements the resilient backpropagation algorithm. @@ -65,4 +66,3 @@ class Rprop(Optimizer): state['prev'].copy_(grad) return loss - diff --git a/torch/serialization.py b/torch/serialization.py index 1724839a5d..e41c137427 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -83,7 +83,7 @@ def location_tag(storage): if location: return location raise RuntimeError("don't know how to determine data location of " + - torch.typename(storage)) + torch.typename(storage)) def default_restore_location(storage, location): @@ -92,7 +92,7 @@ def default_restore_location(storage, location): if result is not None: return result raise RuntimeError("don't know how to restore data location of " + - torch.typename(storage) + " (tagged with " + location + ")") + torch.typename(storage) + " (tagged with " + location + ")") def normalize_storage_type(storage_type): @@ -143,8 +143,8 @@ def _save(obj, f, pickle_module, pickle_protocol): source = inspect.getsource(obj) except (TypeError, IOError): warnings.warn("Couldn't retrieve source code for container of " - "type " + obj.__name__ + ". It won't be checked " - "for correctness upon loading.") + "type " + obj.__name__ + ". It won't be checked " + "for correctness upon loading.") return (obj, source_file, source) if torch.is_tensor(obj): serialized_tensors[obj._cdata] = obj @@ -165,7 +165,7 @@ def _save(obj, f, pickle_module, pickle_protocol): storage_id = None pickle_module.dump((key, storage_id, type(tensor)), f, - protocol=pickle_protocol) + protocol=pickle_protocol) f.flush() tensor._write_metadata(f) @@ -178,7 +178,7 @@ def _save(obj, f, pickle_module, pickle_protocol): if root is not storage: storage_views_roots[root._cdata] = root storage_views.append((storage._cdata, root._cdata, offset, - storage.size())) + storage.size())) for view_info in storage_views: del serialized_storages[view_info[0]] serialized_storages.update(storage_views_roots) @@ -188,7 +188,7 @@ def _save(obj, f, pickle_module, pickle_protocol): location = location_tag(storage) storage_type = normalize_storage_type(type(storage)) pickle_module.dump((key, location, storage_type), f, - protocol=pickle_protocol) + protocol=pickle_protocol) f.flush() storage._write_file(f) @@ -203,7 +203,7 @@ def _save(obj, f, pickle_module, pickle_protocol): sys_info = dict( protocol_version=1000, little_endian=sys.byteorder == 'little', - type_sizes = dict( + type_sizes=dict( short=SHORT_SIZE, int=INT_SIZE, long=LONG_SIZE, @@ -273,10 +273,10 @@ def _load(f, map_location, pickle_module): if container_type.dump_patches: file_name = container_type.__name__ + '.patch' diff = difflib.unified_diff( - current_source.split('\n'), - original_source.split('\n'), - source_file, - source_file, lineterm="") + current_source.split('\n'), + original_source.split('\n'), + source_file, + source_file, lineterm="") lines = '\n'.join(diff) try: with open(file_name, 'a+') as f: @@ -312,7 +312,7 @@ def _load(f, map_location, pickle_module): return deserialized_objects[int(saved_id)] with closing(tarfile.open(fileobj=f, 
mode='r:', format=tarfile.PAX_FORMAT)) as tar, \ - mkdtemp() as tmpdir: + mkdtemp() as tmpdir: tar.extract('storages', path=tmpdir) with open(os.path.join(tmpdir, 'storages'), 'rb', 0) as f: @@ -327,7 +327,7 @@ def _load(f, map_location, pickle_module): storage_views = pickle_module.load(f) for target_cdata, root_cdata, offset, size in storage_views: root = deserialized_objects[root_cdata] - deserialized_objects[target_cdata] = root[offset:offset+size] + deserialized_objects[target_cdata] = root[offset:offset + size] tar.extract('tensors', path=tmpdir) with open(os.path.join(tmpdir, 'tensors'), 'rb', 0) as f: diff --git a/torch/sparse/__init__.py b/torch/sparse/__init__.py index 3c5a50ed8a..b4c8bbe3fb 100644 --- a/torch/sparse/__init__.py +++ b/torch/sparse/__init__.py @@ -5,26 +5,46 @@ import sys _sparse_tensor_classes = set() + class DoubleTensor(_C.SparseDoubleTensorBase): + def is_signed(self): return True + + class FloatTensor(_C.SparseFloatTensorBase): + def is_signed(self): return True + + class LongTensor(_C.SparseLongTensorBase): + def is_signed(self): return True + + class IntTensor(_C.SparseIntTensorBase): + def is_signed(self): return True + + class ShortTensor(_C.SparseShortTensorBase): + def is_signed(self): return True + + class CharTensor(_C.SparseCharTensorBase): + def is_signed(self): # TODO return False + + class ByteTensor(_C.SparseByteTensorBase): + def is_signed(self): return False diff --git a/torch/tensor.py b/torch/tensor.py index 14147391df..787c5dc35f 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -343,7 +343,7 @@ class _TensorBase(object): xtensor = src.new().set_(src) xsize = list(xtensor.size()) - for i in _range(len(repeats)-src.dim()): + for i in _range(len(repeats) - src.dim()): xsize = [1] + xsize size = torch.Size([a * b for a, b in zip(xsize, repeats)]) @@ -351,8 +351,8 @@ class _TensorBase(object): result.resize_(size) urtensor = result.new(result) for i in _range(xtensor.dim()): - urtensor = urtensor.unfold(i,xtensor.size(i),xtensor.size(i)) - for i in _range(urtensor.dim()-xtensor.dim()): + urtensor = urtensor.unfold(i, xtensor.size(i), xtensor.size(i)) + for i in _range(urtensor.dim() - xtensor.dim()): xsize = [1] + xsize xtensor.resize_(torch.Size(xsize)) xxtensor = xtensor.expand_as(urtensor) @@ -391,7 +391,7 @@ class _TensorBase(object): return self.set_(self.storage(), self.storage_offset(), torch.Size(sizes), tuple(strides)) - #TODO: add tests for operators + # TODO: add tests for operators def __add__(self, other): return self.add(other) __radd__ = __add__ @@ -430,7 +430,7 @@ class _TensorBase(object): elif dim_self == 2 and dim_other == 2: return self.mm(other) raise ValueError("both arguments to __matmul__ need to be 1D or 2D, " - "but they are {}D and {}D".format(dim_self, dim_other)) + "but they are {}D and {}D".format(dim_self, dim_other)) def __pow__(self, other): return self.pow(other) diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index 8b13789179..e69de29bb2 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -1 +0,0 @@ - diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py index bdf8d52915..cd9506d3cf 100644 --- a/torch/utils/data/dataset.py +++ b/torch/utils/data/dataset.py @@ -39,4 +39,3 @@ class TensorDataset(Dataset): def __len__(self): return self.data_tensor.size(0) - diff --git a/torch/utils/ffi/__init__.py b/torch/utils/ffi/__init__.py index 4c5b620e0e..78c48c8e0b 100644 --- a/torch/utils/ffi/__init__.py +++ b/torch/utils/ffi/__init__.py @@ -14,9 +14,9 @@ except 
ImportError: raise ImportError("torch.utils.ffi requires the cffi package") -if cffi.__version_info__ < (1,4,0): +if cffi.__version_info__ < (1, 4, 0): raise ImportError("torch.utils.ffi requires cffi version >= 1.4, but " - "got " + '.'.join(map(str, cffi.__version_info__))) + "got " + '.'.join(map(str, cffi.__version_info__))) def _generate_typedefs(): @@ -106,13 +106,13 @@ def _build_extension(ffi, cffi_wrapper_name, target_dir, verbose): def _make_python_wrapper(name, cffi_wrapper_name, target_dir): py_source = PY_MODULE_TEMPLATE.substitute(name=name, - cffi_wrapper_name=cffi_wrapper_name) + cffi_wrapper_name=cffi_wrapper_name) with open(os.path.join(target_dir, '__init__.py'), 'w') as f: f.write(py_source) def create_extension(name, headers, sources, verbose=True, with_cuda=False, - package=False, relative_to='.', **kwargs): + package=False, relative_to='.', **kwargs): """Creates and configures a cffi.FFI object, that builds PyTorch extension. Arguments: @@ -159,6 +159,7 @@ def create_extension(name, headers, sources, verbose=True, with_cuda=False, ffi.cdef(_typedefs + all_headers_source) _make_python_wrapper(name_suffix, '_' + name_suffix, target_dir) + def build(): _build_extension(ffi, cffi_wrapper_name, target_dir, verbose) ffi.build = build @@ -169,9 +170,9 @@ def _wrap_function(function, ffi): @wraps(function) def safe_call(*args, **kwargs): args = tuple(ffi.cast(_torch_to_cffi.get(type(arg), 'void') + '*', arg._cdata) - if torch.is_tensor(arg) or torch.is_storage(arg) - else arg - for arg in args) + if torch.is_tensor(arg) or torch.is_storage(arg) + else arg + for arg in args) args = (function,) + args result = torch._C._safe_call(*args, **kwargs) if isinstance(result, ffi.CData): @@ -183,4 +184,3 @@ def _wrap_function(function, ffi): return _cffi_to_torch[cname](cdata=cdata) return result return safe_call - diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py index 0a09505825..8d9ab4890b 100644 --- a/torch/utils/hooks.py +++ b/torch/utils/hooks.py @@ -3,6 +3,7 @@ import weakref class RemovableHandle(object): """A handle which provides the capability to remove a hook.""" + def __init__(self, hooks_dict): self.hooks_dict_ref = weakref.ref(hooks_dict) diff --git a/torch/utils/model_zoo.py b/torch/utils/model_zoo.py index 791ed3ed74..ddc16b4b9a 100644 --- a/torch/utils/model_zoo.py +++ b/torch/utils/model_zoo.py @@ -92,6 +92,7 @@ def _download_url_to_file(url, dst, hash_prefix): if tqdm is None: # fake tqdm if it's not installed class tqdm(object): + def __init__(self, total): self.total = total self.n = 0 diff --git a/torch/utils/serialization/read_lua_file.py b/torch/utils/serialization/read_lua_file.py index 4d654a0072..1466d32e0b 100644 --- a/torch/utils/serialization/read_lua_file.py +++ b/torch/utils/serialization/read_lua_file.py @@ -130,6 +130,7 @@ def get_python_class(typename): def make_tensor_reader(typename): python_class = get_python_class(typename) + def read_tensor(reader, version): # source: # https://github.com/torch/torch7/blob/master/generic/Tensor.c#L1243 @@ -156,6 +157,7 @@ def make_storage_reader(typename): python_class = get_python_class(typename) # TODO: be smarter about this element_size = python_class().element_size() + def read_storage(reader, version): # source: # https://github.com/torch/torch7/blob/master/generic/Storage.c#L244 @@ -185,6 +187,7 @@ register_torch_class('Tensor', make_tensor_reader) # Reader function for tds.Vector and tds.Hash ################################################################################ + def 
tds_Vec_reader(reader, version): length = reader.read_long() return [reader.read() for i in range(length)] @@ -207,6 +210,7 @@ reader_registry['tds.Hash'] = tds_Hash_reader # Reader function for nn modules ################################################################################ + def _load_backend(obj): if hasattr(obj, '_type'): obj._backend = type2backend[obj._type] @@ -221,6 +225,7 @@ def _load_backend(obj): pass # Monkey patch the forward to capture the type of input updateOutput_orig = obj.updateOutput + def updateOutput_patch(*args): input = args[0] while not torch.is_tensor(input): @@ -242,13 +247,14 @@ def nn_reader(cls): reader_registry.update({('nn.' + name): nn_reader(module) - for name, module in nn.__dict__.items() - if name[0] != '_' and name[0].upper() == name[0]}) + for name, module in nn.__dict__.items() + if name[0] != '_' and name[0].upper() == name[0]}) def custom_reader(cls): def reader_factory(fn): base = nn_reader(cls) + def wrapper(reader, version): obj = base(reader, version) fn(reader, version, obj) @@ -271,7 +277,7 @@ for prefix in ['', 'Spatial', 'Volumetric']: @custom_reader(nn.Transpose) def Transpose_reader(reader, version, obj): obj.permutations = list( - map(lambda swap: [swap[0]-1, swap[1]-1], obj.permutations)) + map(lambda swap: [swap[0] - 1, swap[1] - 1], obj.permutations)) @custom_reader(nn.SpatialDivisiveNormalization) @@ -299,6 +305,7 @@ def registry_addon(fn): def wrapper_factory(module_name, *args, **kwargs): module_name = 'nn.' + module_name build_fn = reader_registry[module_name] + def wrapper(reader, version): obj = build_fn(reader, version) fn(obj, *args, **kwargs) @@ -306,6 +313,7 @@ def registry_addon(fn): reader_registry[module_name] = wrapper return wrapper_factory + @registry_addon def attr_map(obj, attribute_map): for src, dst in attribute_map.items(): @@ -521,9 +529,9 @@ class T7Reader: if self.unknown_classes: return TorchObject(cls_name, self.read()) raise T7ReaderException(("don't know how to deserialize Lua class " - "{}. If you want to ignore this error and load this object " - "as a dict, specify unknown_classes=True in reader's " - "constructor").format(cls_name)) + "{}. If you want to ignore this error and load this object " + "as a dict, specify unknown_classes=True in reader's " + "constructor").format(cls_name)) def _can_be_list(self, table): def is_natural(key): @@ -546,7 +554,7 @@ class T7Reader: v = self.read() table[k] = v if self.list_heuristic and self._can_be_list(table): - return [table[i] for i in range(1, len(table)+1)] + return [table[i] for i in range(1, len(table) + 1)] return table def read(self): @@ -569,7 +577,7 @@ class T7Reader: return self.read_table() else: raise T7ReaderException("unknown type id {}. 
The file may be " - "corrupted.".format(typeidx)) + "corrupted.".format(typeidx)) def load_lua(filename, **kwargs): @@ -580,4 +588,3 @@ def load_lua(filename, **kwargs): with open(filename, 'rb') as f: reader = T7Reader(f, **kwargs) return reader.read() - diff --git a/torch/utils/trainer/plugins/__init__.py b/torch/utils/trainer/plugins/__init__.py index 4258e05ebe..e8d10f48ae 100644 --- a/torch/utils/trainer/plugins/__init__.py +++ b/torch/utils/trainer/plugins/__init__.py @@ -3,4 +3,3 @@ from .accuracy import AccuracyMonitor from .time import TimeMonitor from .loss import LossMonitor from .logger import Logger - diff --git a/torch/utils/trainer/plugins/accuracy.py b/torch/utils/trainer/plugins/accuracy.py index c1431e0817..f6f393c16d 100644 --- a/torch/utils/trainer/plugins/accuracy.py +++ b/torch/utils/trainer/plugins/accuracy.py @@ -1,5 +1,6 @@ from .monitor import Monitor + class AccuracyMonitor(Monitor): stat_name = 'accuracy' @@ -16,4 +17,3 @@ class AccuracyMonitor(Monitor): correct = correct.cpu() correct = correct.sum() return 100. * correct / batch_size - diff --git a/torch/utils/trainer/plugins/logger.py b/torch/utils/trainer/plugins/logger.py index f132a11e54..9bc2dfc6a4 100644 --- a/torch/utils/trainer/plugins/logger.py +++ b/torch/utils/trainer/plugins/logger.py @@ -1,6 +1,7 @@ from collections import defaultdict from .plugin import Plugin + class Logger(Plugin): alignment = 4 separator = '#' * 80 @@ -58,7 +59,7 @@ class Logger(Plugin): for f in field: parent, stat = stat, stat[f] name, output = self._gather_outputs(field, log_fields, - parent, stat, require_dict) + parent, stat, require_dict) if not output: continue self._align_output(field_idx, output) @@ -77,7 +78,6 @@ class Logger(Plugin): def epoch(self, epoch_idx): self._log_all('log_epoch_fields', - prefix=self.separator + '\nEpoch summary:', - suffix=self.separator, - require_dict=True) - + prefix=self.separator + '\nEpoch summary:', + suffix=self.separator, + require_dict=True) diff --git a/torch/utils/trainer/plugins/loss.py b/torch/utils/trainer/plugins/loss.py index 320158c856..eea44ca81f 100644 --- a/torch/utils/trainer/plugins/loss.py +++ b/torch/utils/trainer/plugins/loss.py @@ -1,8 +1,8 @@ from .monitor import Monitor + class LossMonitor(Monitor): stat_name = 'loss' def _get_value(self, iteration, input, target, output, loss): return loss[0] - diff --git a/torch/utils/trainer/plugins/monitor.py b/torch/utils/trainer/plugins/monitor.py index 80bdf37e75..cb8da2e6e7 100644 --- a/torch/utils/trainer/plugins/monitor.py +++ b/torch/utils/trainer/plugins/monitor.py @@ -41,7 +41,7 @@ class Monitor(Plugin): if self.with_epoch_average: stats['epoch_stats'] = tuple(sum(t) for t in - zip(stats['epoch_stats'], (stats['last'], 1))) + zip(stats['epoch_stats'], (stats['last'], 1))) if self.with_running_average: previous_avg = stats.get('running_avg', 0) @@ -54,4 +54,3 @@ class Monitor(Plugin): epoch_stats = stats['epoch_stats'] stats['epoch_mean'] = epoch_stats[0] / epoch_stats[1] stats['epoch_stats'] = (0, 0) - diff --git a/torch/utils/trainer/plugins/plugin.py b/torch/utils/trainer/plugins/plugin.py index 145c6b93d1..e1ac25101f 100644 --- a/torch/utils/trainer/plugins/plugin.py +++ b/torch/utils/trainer/plugins/plugin.py @@ -8,4 +8,3 @@ class Plugin(object): def register(self, trainer): raise NotImplementedError - diff --git a/torch/utils/trainer/plugins/progress.py b/torch/utils/trainer/plugins/progress.py index 582f087eb5..06a3c3f92d 100644 --- a/torch/utils/trainer/plugins/progress.py +++ 
b/torch/utils/trainer/plugins/progress.py @@ -26,4 +26,3 @@ class ProgressMonitor(Plugin): stats = self.trainer.stats.setdefault(self.stat_name, {}) stats['samples_used'] = 0 stats['percent'] = 0 - diff --git a/torch/utils/trainer/plugins/time.py b/torch/utils/trainer/plugins/time.py index 8b79fa3e82..ffdc1988d5 100644 --- a/torch/utils/trainer/plugins/time.py +++ b/torch/utils/trainer/plugins/time.py @@ -22,4 +22,3 @@ class TimeMonitor(Monitor): else: self.last_time = time.time() return 0 - diff --git a/torch/utils/trainer/trainer.py b/torch/utils/trainer/trainer.py index d5157b7b06..9cdf5643c5 100644 --- a/torch/utils/trainer/trainer.py +++ b/torch/utils/trainer/trainer.py @@ -58,6 +58,7 @@ class Trainer(object): target_var = Variable(batch_target) plugin_data = [None, None] + def closure(): batch_output = self.model(input_var) loss = self.criterion(batch_output, target_var) @@ -70,7 +71,7 @@ class Trainer(object): self.optimizer.zero_grad() self.optimizer.step(closure) self.call_plugins('iteration', i, batch_input, batch_target, - *plugin_data) + *plugin_data) self.call_plugins('update', i, self.model) self.iterations += i
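Most of the hunks above apply the same PEP 8-style layout rules: continuation lines are aligned under the opening parenthesis of the call or definition they belong to, binary operators such as * and - get a space on each side, a blank line separates each method definition from the docstring above it, and stray trailing blank lines at end of file are dropped. A minimal sketch of that layout, using a hypothetical TinyPool module rather than code taken from the patch (assuming a recent PyTorch build):

import torch
import torch.nn.functional as F
from torch.nn import Module


class TinyPool(Module):
    """Hypothetical module written in the layout the patch converges on."""

    def __init__(self, kernel_size, stride=None, padding=0):
        super(TinyPool, self).__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding

    def forward(self, input):
        # Continuation arguments line up under the opening parenthesis
        # instead of carrying an arbitrary hanging indent.
        return F.max_pool2d(input, self.kernel_size, self.stride,
                            self.padding)


print(TinyPool(2)(torch.randn(1, 3, 8, 8)).shape)  # torch.Size([1, 3, 4, 4])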
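The pooling and unpooling hunks also pass kernel_size, stride, padding and dilation through the _single/_pair/_triple helpers produced by _ntuple in torch/nn/modules/utils.py before forwarding them or building the __repr__ strings. A short usage sketch (the helpers are private but importable in a recent PyTorch build):

from torch.nn.modules.utils import _single, _pair, _triple

# A bare int is broadcast to every spatial dimension; an explicit tuple
# is passed through unchanged.
print(_single(3))      # (3,)
print(_pair(3))        # (3, 3)
print(_pair((2, 5)))   # (2, 5)
print(_triple(1))      # (1, 1, 1)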
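The rnn.py hunks only add spaces around the multiplications, but the 4 * hidden_size and 3 * hidden_size factors they touch come from LSTMCell stacking its input, forget, cell and output gate transforms into a single weight matrix, while GRUCell stacks its reset, update and candidate transforms the same way. This can be checked directly against the public modules (a sketch assuming a current torch.nn build):

import torch.nn as nn

input_size, hidden_size = 10, 20
lstm_cell = nn.LSTMCell(input_size, hidden_size)
gru_cell = nn.GRUCell(input_size, hidden_size)

# Four stacked gate transforms for the LSTM, three for the GRU.
print(lstm_cell.weight_ih.shape)  # torch.Size([80, 10]) == (4 * hidden_size, input_size)
print(gru_cell.weight_ih.shape)   # torch.Size([60, 10]) == (3 * hidden_size, input_size)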