Unverified Commit 7d7edf6d authored by Anupam Bhatnagar, committed by GitHub

Setup pre-commit github action and apply pre-commit to all files (#849)

* adding pre-commit files

* applying pre-commit to all files

* adding no-strict-optional argument to mypy in circle ci config

* fix typo

* updating python versions

* [skip ci] remove extra args

* adding python 3.9

* [skip ci] set pre-commit version in requirements-dev.txt

* set CACHE_VERSION

* move linters from circleci to github actions

* update python version

* update python version in benchmarks_2

* moving to python 3.9.7
Parent commit: 6f3931a4
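The bullets above outline the mechanical steps: add a pre-commit configuration, run it once across the whole tree, pin the tool versions, and move the lint jobs from CircleCI to GitHub Actions. The configuration files themselves are not part of this excerpt, so the sketch below only illustrates what such a setup typically looks like; the hook list and versions are assumptions, not the actual file added by this commit.

```yaml
# .pre-commit-config.yaml -- illustrative sketch only; the hooks and
# versions here are assumptions, not the actual file from this commit.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  - repo: https://github.com/psf/black
    rev: 21.9b0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
```

Pinning pre-commit itself in requirements-dev.txt (for example `pre-commit==2.15.0`; the exact version used by the commit is not shown in this excerpt) keeps contributors and CI on the same hook environments. The mypy tweak mentioned above amounts to passing mypy's `--no-strict-optional` flag, which relaxes `Optional` type checking so the existing code passes without a sweeping annotation cleanup.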
```diff
@@ -32,7 +32,12 @@ def test_simple_linears():
         p.grad = None

     inputs = torch.rand(8, 1)
-    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)
+    model = nn.Sequential(
+        nn.Linear(1, 2),
+        nn.Linear(2, 4),
+        nn.Linear(4, 2),
+        nn.Linear(2, 1),
+    )

     # Without Pipe
     outputs = model(inputs)
```
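Every hunk in this diff repeats the pattern shown above, and it is worth naming: black's "magic trailing comma". When the last argument of a call is followed by a trailing comma, black refuses to collapse the call onto one line and instead puts one argument per line. A minimal demonstration (plain black behavior, not code from this commit):

```python
from torch import nn

# The trailing comma after the last argument makes black keep the call
# exploded, one argument per line:
model = nn.Sequential(
    nn.Linear(1, 2),
    nn.Linear(2, 4),
)

# Without the trailing comma, black collapses the call onto a single line
# whenever it fits within the line-length limit:
model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4))
```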
```diff
@@ -109,7 +109,13 @@ def mpi():

 def public_attrs(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
-    pipe = pipe_class(model, balance=(1,), worker_map=get_worker_map(), chunks=42, checkpoint="always",)
+    pipe = pipe_class(
+        model,
+        balance=(1,),
+        worker_map=get_worker_map(),
+        chunks=42,
+        checkpoint="always",
+    )

     assert pipe.balance == [1]
     assert pipe.chunks == 42
@@ -257,9 +263,27 @@ def checkpoint_mode(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
     input = torch.rand(2, 1)

-    always = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="always",)
-    except_last = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="except_last",)
-    never = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="never",)
+    always = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="always",
+    )
+    except_last = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="except_last",
+    )
+    never = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="never",
+    )

     always_output = always(input)
     except_last_output = except_last(input)
@@ -277,7 +301,11 @@ def checkpoint_mode_invalid(pipe_class):

     with pytest.raises(ValueError, match="checkpoint is not one of 'always', 'except_last', or 'never'"):
         pipe_class(
-            model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="INVALID_CHECKPOINT",
+            model,
+            balance=[1],
+            worker_map=get_worker_map(),
+            chunks=2,
+            checkpoint="INVALID_CHECKPOINT",
         )
@@ -288,7 +316,11 @@ def checkpoint_mode_when_chunks_1(pipe_class):

     # All checkpoint modes are fine.
     pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="except_last",
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint="except_last",
     )
     pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="always")
     pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="never")
@@ -298,7 +330,12 @@ def checkpoint_mode_when_chunks_1(pipe_class):
 @pytest.mark.parametrize("pipe_class", [AsyncPipe])
 def checkpoint_eval(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )
     input = torch.rand(2, 1)

     def find_grad_fn(grad_fn, name):
@@ -335,7 +372,13 @@ def checkpoint_non_float_input(pipe_class):
             return input[0] * 2

     model = nn.Sequential(ForkNonFloat(), JoinNonFloat())
-    model = pipe_class(model, balance=[1, 1], worker_map=get_worker_map(), chunks=1, checkpoint="always",)
+    model = pipe_class(
+        model,
+        balance=[1, 1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint="always",
+    )
     input = torch.rand(1, requires_grad=True)
     output = model(input)
@@ -444,7 +487,12 @@ def input_pair(pipe_class):
             return (self.fc_a(a), self.fc_b(b))

     model = nn.Sequential(Two())
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )

     a = torch.rand(10, 1, requires_grad=True)
     b = torch.rand(10, 1, requires_grad=True)
@@ -470,7 +518,12 @@ def input_singleton(pipe_class):
             return (self.fc(a),)

     model = nn.Sequential(One())
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )

     a = torch.rand(10, 1, requires_grad=True)
@@ -549,7 +602,12 @@ def deferred_batch_norm(checkpoint, lazy, pipe_class):
     else:
         model = nn.Sequential(pipe_bn)
     pipe = pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint=checkpoint, deferred_batch_norm=True,
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint=checkpoint,
+        deferred_batch_norm=True,
     )

     x = torch.rand(4, 3, 10, 10)
@@ -573,7 +631,12 @@ def deferred_batch_norm_params(checkpoint, lazy, pipe_class):
     else:
         model = nn.Sequential(pipe_bn)
     pipe = pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint=checkpoint, deferred_batch_norm=True,
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint=checkpoint,
+        deferred_batch_norm=True,
     )

     x = torch.rand(4, 3, 10, 10)
```
```diff
@@ -72,7 +72,11 @@ def check_pipe_against_reference(balance, model_constructor, checkpoint="except_last"):
     reference_model = nn.Sequential(*reference_model).cuda()

     pipe = PipeRPCWrapper(
-        model, balance, input_device=torch.cuda.current_device(), worker_map=get_worker_map(), checkpoint=checkpoint,
+        model,
+        balance,
+        input_device=torch.cuda.current_device(),
+        worker_map=get_worker_map(),
+        checkpoint=checkpoint,
     )

     pipe.foreach_worker(register_optimizer, include_self=True)
@@ -118,7 +122,8 @@ def rpc_optimizer():
         return [reused_1, nn.ReLU(), reused_1, nn.ReLU(), reused_1, nn.ReLU()]

     check_pipe_against_reference(
-        [2, 2, 2], lambda: [nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU()],
+        [2, 2, 2],
+        lambda: [nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU()],
     )
     check_pipe_against_reference([2, 1, 1], model_with_reuse)
```
```diff
@@ -38,7 +38,12 @@ def simple_linears(pipe_class):
     set_random_seed(12345)
     inputs = torch.rand(8, 1)
-    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)
+    model = nn.Sequential(
+        nn.Linear(1, 2),
+        nn.Linear(2, 4),
+        nn.Linear(4, 2),
+        nn.Linear(2, 1),
+    )

     # Without MultiProcessPipe
     outputs = model(inputs)
```
```diff
@@ -473,7 +473,10 @@ def test_collect_shards():
     reference_rank = 0

     mp.spawn(
-        run_test_collect_shards, args=(world_size, reference_rank, temp_file_name), nprocs=world_size, join=True,
+        run_test_collect_shards,
+        args=(world_size, reference_rank, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -537,7 +540,10 @@ def test_reproducibility(broadcast_fp16: bool):
     temp_file_name = tempfile.mkstemp()[1]

     mp.spawn(
-        run_test_reproducibility, args=(world_size, temp_file_name, broadcast_fp16), nprocs=world_size, join=True,
+        run_test_reproducibility,
+        args=(world_size, temp_file_name, broadcast_fp16),
+        nprocs=world_size,
+        join=True,
     )
@@ -618,7 +624,10 @@ def test_multiple_groups():
     temp_file_name = tempfile.mkstemp()[1]

     mp.spawn(
-        run_test_multiple_groups, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_test_multiple_groups,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -646,10 +655,16 @@ def run_gradient_clipping(rank, world_size, tempfile_name):
     # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the
     # gradient norm computation from OSS and adds a dependency.
     # to keep the comparison apples-to-apples DDP is used in both cases
-    model_oss = DDP(module=model_oss, device_ids=[rank],)
+    model_oss = DDP(
+        module=model_oss,
+        device_ids=[rank],
+    )
     sharded_optimizer = optim.OSS(model_oss.parameters(), lr=0.1, momentum=0.99)

-    model = DDP(model, device_ids=[rank],)
+    model = DDP(
+        model,
+        device_ids=[rank],
+    )

     loss_fn = torch.nn.L1Loss()
     loss_fn.to(device)
@@ -697,7 +712,10 @@ def test_gradient_clipping():
     reference_rank = 0

     mp.spawn(
-        run_gradient_clipping, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_gradient_clipping,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -723,11 +741,17 @@ def run_state_dict_distributed(rank, world_size, tempfile_name):
     # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the
     # gradient norm computation from OSS and adds a dependency.
     # to keep the comparison apples-to-apples DDP is used in both cases
-    model_oss1 = DDP(module=model_oss1, device_ids=[rank],)
+    model_oss1 = DDP(
+        module=model_oss1,
+        device_ids=[rank],
+    )
     sharded_optimizer1 = optim.OSS(model_oss1.parameters(), lr=0.1, momentum=0.99)
     sharded_optimizer1.add_param_group({"params": head_oss1.parameters()})

-    model_oss2 = DDP(module=model_oss2, device_ids=[rank],)
+    model_oss2 = DDP(
+        module=model_oss2,
+        device_ids=[rank],
+    )
     sharded_optimizer2 = optim.OSS(model_oss2.parameters(), lr=0.1, momentum=0.99)
     sharded_optimizer2.add_param_group({"params": head_oss2.parameters()})
@@ -804,7 +828,10 @@ def test_state_dict_distributed():
     world_size = max(world_size, torch.cuda.device_count())

     mp.spawn(
-        run_state_dict_distributed, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_state_dict_distributed,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
```
```diff
@@ -18,7 +18,10 @@ from fairscale.utils.testing import dist_init, spawn_for_all_world_sizes

 def rename_test(testcase_func, param_num, param):
-    return "%s_%s" % (testcase_func.__name__, parameterized.to_safe_name(str(param.args)),)
+    return "%s_%s" % (
+        testcase_func.__name__,
+        parameterized.to_safe_name(str(param.args)),
+    )


 CONFIG_OPTIONS = [
```
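With formatting handled by pre-commit, the lint jobs themselves move from CircleCI to a GitHub Actions workflow. The workflow file is not included in this excerpt; the sketch below shows the usual shape of such a job, with the file name, action versions, and Python version wiring assumed rather than taken from the commit.

```yaml
# .github/workflows/pre-commit.yml -- illustrative sketch; the workflow
# actually added by this commit may differ in name, versions, and caching.
name: pre-commit

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: "3.9.7"
      - name: Run pre-commit on all files
        run: |
          pip install pre-commit
          pre-commit run --all-files
```

The CACHE_VERSION variable mentioned in the commit message is the usual cache-busting trick: hook environments are typically cached (for example with actions/cache) keyed on the hash of .pre-commit-config.yaml plus a manually bumped version string, so a stale cache can be invalidated by changing one value.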