Unverified Commit 7d7edf6d authored by Anupam Bhatnagar, committed by GitHub

Setup pre-commit github action and apply pre-commit to all files (#849)

* adding pre-commit files

* applying pre-commit to all files

* adding no-strict-optional argument to mypy in circle ci config

* fix typo

* updating python versions

* [skip ci] remove extra args

* adding python 3.9

* [skip ci] set pre-commit version in requirements-dev.txt

* set CACHE_VERSION

* move linters from circleci to github actions

* update python version

* update python version in benchmarks_2

* moving to python 3.9.7
Parent commit: 6f3931a4
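The bullets above outline the mechanical steps: add a pre-commit configuration, run it once across the whole tree, pin the tool versions, and move the lint jobs from CircleCI to GitHub Actions. The configuration files themselves are not part of this excerpt, so the sketch below only illustrates what such a setup typically looks like; the hook list and versions are assumptions, not the actual file added by this commit.

```yaml
# .pre-commit-config.yaml -- illustrative sketch only; the hooks and
# versions here are assumptions, not the actual file from this commit.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  - repo: https://github.com/psf/black
    rev: 21.9b0
    hooks:
      - id: black
  - repo: https://github.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
```

Pinning pre-commit itself in requirements-dev.txt (for example `pre-commit==2.15.0`; the exact version used by the commit is not shown in this excerpt) keeps contributors and CI on the same hook environments. The mypy tweak mentioned above amounts to passing mypy's `--no-strict-optional` flag, which relaxes `Optional` type checking so the existing code passes without a sweeping annotation cleanup.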
```diff
@@ -32,7 +32,12 @@ def test_simple_linears():
         p.grad = None

     inputs = torch.rand(8, 1)
-    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)
+    model = nn.Sequential(
+        nn.Linear(1, 2),
+        nn.Linear(2, 4),
+        nn.Linear(4, 2),
+        nn.Linear(2, 1),
+    )

     # Without Pipe
     outputs = model(inputs)
```
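Every hunk in this diff repeats the pattern shown above, and it is worth naming: black's "magic trailing comma". When the last argument of a call is followed by a trailing comma, black refuses to collapse the call onto one line and instead puts one argument per line. A minimal demonstration (plain black behavior, not code from this commit):

```python
from torch import nn

# The trailing comma after the last argument makes black keep the call
# exploded, one argument per line:
model = nn.Sequential(
    nn.Linear(1, 2),
    nn.Linear(2, 4),
)

# Without the trailing comma, black collapses the call onto a single line
# whenever it fits within the line-length limit:
model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4))
```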
```diff
@@ -109,7 +109,13 @@ def mpi():

 def public_attrs(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
-    pipe = pipe_class(model, balance=(1,), worker_map=get_worker_map(), chunks=42, checkpoint="always",)
+    pipe = pipe_class(
+        model,
+        balance=(1,),
+        worker_map=get_worker_map(),
+        chunks=42,
+        checkpoint="always",
+    )

     assert pipe.balance == [1]
     assert pipe.chunks == 42
@@ -257,9 +263,27 @@ def checkpoint_mode(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
     input = torch.rand(2, 1)

-    always = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="always",)
-    except_last = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="except_last",)
-    never = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="never",)
+    always = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="always",
+    )
+    except_last = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="except_last",
+    )
+    never = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint="never",
+    )

     always_output = always(input)
     except_last_output = except_last(input)
@@ -277,7 +301,11 @@ def checkpoint_mode_invalid(pipe_class):

     with pytest.raises(ValueError, match="checkpoint is not one of 'always', 'except_last', or 'never'"):
         pipe_class(
-            model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint="INVALID_CHECKPOINT",
+            model,
+            balance=[1],
+            worker_map=get_worker_map(),
+            chunks=2,
+            checkpoint="INVALID_CHECKPOINT",
         )
@@ -288,7 +316,11 @@ def checkpoint_mode_when_chunks_1(pipe_class):

     # All checkpoint modes are fine.
     pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="except_last",
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint="except_last",
     )
     pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="always")
     pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint="never")
@@ -298,7 +330,12 @@ def checkpoint_mode_when_chunks_1(pipe_class):
 @pytest.mark.parametrize("pipe_class", [AsyncPipe])
 def checkpoint_eval(pipe_class):
     model = nn.Sequential(nn.Linear(1, 1))
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )
     input = torch.rand(2, 1)

     def find_grad_fn(grad_fn, name):
@@ -335,7 +372,13 @@ def checkpoint_non_float_input(pipe_class):
             return input[0] * 2

     model = nn.Sequential(ForkNonFloat(), JoinNonFloat())
-    model = pipe_class(model, balance=[1, 1], worker_map=get_worker_map(), chunks=1, checkpoint="always",)
+    model = pipe_class(
+        model,
+        balance=[1, 1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint="always",
+    )
     input = torch.rand(1, requires_grad=True)
     output = model(input)
@@ -444,7 +487,12 @@ def input_pair(pipe_class):
             return (self.fc_a(a), self.fc_b(b))

     model = nn.Sequential(Two())
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )

     a = torch.rand(10, 1, requires_grad=True)
     b = torch.rand(10, 1, requires_grad=True)
@@ -470,7 +518,12 @@ def input_singleton(pipe_class):
             return (self.fc(a),)

     model = nn.Sequential(One())
-    model = pipe_class(model, balance=[1], worker_map=get_worker_map(), chunks=2,)
+    model = pipe_class(
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+    )

     a = torch.rand(10, 1, requires_grad=True)
@@ -549,7 +602,12 @@ def deferred_batch_norm(checkpoint, lazy, pipe_class):
     else:
         model = nn.Sequential(pipe_bn)
     pipe = pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=2, checkpoint=checkpoint, deferred_batch_norm=True,
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=2,
+        checkpoint=checkpoint,
+        deferred_batch_norm=True,
     )

     x = torch.rand(4, 3, 10, 10)
@@ -573,7 +631,12 @@ def deferred_batch_norm_params(checkpoint, lazy, pipe_class):
     else:
         model = nn.Sequential(pipe_bn)
     pipe = pipe_class(
-        model, balance=[1], worker_map=get_worker_map(), chunks=1, checkpoint=checkpoint, deferred_batch_norm=True,
+        model,
+        balance=[1],
+        worker_map=get_worker_map(),
+        chunks=1,
+        checkpoint=checkpoint,
+        deferred_batch_norm=True,
     )

     x = torch.rand(4, 3, 10, 10)
```
```diff
@@ -72,7 +72,11 @@ def check_pipe_against_reference(balance, model_constructor, checkpoint="except_last"):
     reference_model = nn.Sequential(*reference_model).cuda()

     pipe = PipeRPCWrapper(
-        model, balance, input_device=torch.cuda.current_device(), worker_map=get_worker_map(), checkpoint=checkpoint,
+        model,
+        balance,
+        input_device=torch.cuda.current_device(),
+        worker_map=get_worker_map(),
+        checkpoint=checkpoint,
     )

     pipe.foreach_worker(register_optimizer, include_self=True)
@@ -118,7 +122,8 @@ def rpc_optimizer():
         return [reused_1, nn.ReLU(), reused_1, nn.ReLU(), reused_1, nn.ReLU()]

     check_pipe_against_reference(
-        [2, 2, 2], lambda: [nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU()],
+        [2, 2, 2],
+        lambda: [nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10), nn.ReLU()],
     )
     check_pipe_against_reference([2, 1, 1], model_with_reuse)
```
```diff
@@ -38,7 +38,12 @@ def simple_linears(pipe_class):
     set_random_seed(12345)
     inputs = torch.rand(8, 1)
-    model = nn.Sequential(nn.Linear(1, 2), nn.Linear(2, 4), nn.Linear(4, 2), nn.Linear(2, 1),)
+    model = nn.Sequential(
+        nn.Linear(1, 2),
+        nn.Linear(2, 4),
+        nn.Linear(4, 2),
+        nn.Linear(2, 1),
+    )

     # Without MultiProcessPipe
     outputs = model(inputs)
```
```diff
@@ -473,7 +473,10 @@ def test_collect_shards():
     reference_rank = 0

     mp.spawn(
-        run_test_collect_shards, args=(world_size, reference_rank, temp_file_name), nprocs=world_size, join=True,
+        run_test_collect_shards,
+        args=(world_size, reference_rank, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -537,7 +540,10 @@ def test_reproducibility(broadcast_fp16: bool):
     temp_file_name = tempfile.mkstemp()[1]

     mp.spawn(
-        run_test_reproducibility, args=(world_size, temp_file_name, broadcast_fp16), nprocs=world_size, join=True,
+        run_test_reproducibility,
+        args=(world_size, temp_file_name, broadcast_fp16),
+        nprocs=world_size,
+        join=True,
     )
@@ -618,7 +624,10 @@ def test_multiple_groups():
     temp_file_name = tempfile.mkstemp()[1]

     mp.spawn(
-        run_test_multiple_groups, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_test_multiple_groups,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -646,10 +655,16 @@ def run_gradient_clipping(rank, world_size, tempfile_name):
     # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the
     # gradient norm computation from OSS and adds a dependency.
     # to keep the comparison apples-to-apples DDP is used in both cases
-    model_oss = DDP(module=model_oss, device_ids=[rank],)
+    model_oss = DDP(
+        module=model_oss,
+        device_ids=[rank],
+    )
     sharded_optimizer = optim.OSS(model_oss.parameters(), lr=0.1, momentum=0.99)

-    model = DDP(model, device_ids=[rank],)
+    model = DDP(
+        model,
+        device_ids=[rank],
+    )

     loss_fn = torch.nn.L1Loss()
     loss_fn.to(device)
@@ -697,7 +712,10 @@ def test_gradient_clipping():
     reference_rank = 0

     mp.spawn(
-        run_gradient_clipping, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_gradient_clipping,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
@@ -723,11 +741,17 @@ def run_state_dict_distributed(rank, world_size, tempfile_name):
     # Normally OSS would use ShardedDDP and only reduce to the proper rank, but this does not change the
     # gradient norm computation from OSS and adds a dependency.
     # to keep the comparison apples-to-apples DDP is used in both cases
-    model_oss1 = DDP(module=model_oss1, device_ids=[rank],)
+    model_oss1 = DDP(
+        module=model_oss1,
+        device_ids=[rank],
+    )
     sharded_optimizer1 = optim.OSS(model_oss1.parameters(), lr=0.1, momentum=0.99)
     sharded_optimizer1.add_param_group({"params": head_oss1.parameters()})

-    model_oss2 = DDP(module=model_oss2, device_ids=[rank],)
+    model_oss2 = DDP(
+        module=model_oss2,
+        device_ids=[rank],
+    )
     sharded_optimizer2 = optim.OSS(model_oss2.parameters(), lr=0.1, momentum=0.99)
     sharded_optimizer2.add_param_group({"params": head_oss2.parameters()})
@@ -804,7 +828,10 @@ def test_state_dict_distributed():
     world_size = max(world_size, torch.cuda.device_count())

     mp.spawn(
-        run_state_dict_distributed, args=(world_size, temp_file_name), nprocs=world_size, join=True,
+        run_state_dict_distributed,
+        args=(world_size, temp_file_name),
+        nprocs=world_size,
+        join=True,
     )
```
```diff
@@ -18,7 +18,10 @@ from fairscale.utils.testing import dist_init, spawn_for_all_world_sizes

 def rename_test(testcase_func, param_num, param):
-    return "%s_%s" % (testcase_func.__name__, parameterized.to_safe_name(str(param.args)),)
+    return "%s_%s" % (
+        testcase_func.__name__,
+        parameterized.to_safe_name(str(param.args)),
+    )


 CONFIG_OPTIONS = [
```
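With formatting handled by pre-commit, the lint jobs themselves move from CircleCI to a GitHub Actions workflow. The workflow file is not included in this excerpt; the sketch below shows the usual shape of such a job, with the file name, action versions, and Python version wiring assumed rather than taken from the commit.

```yaml
# .github/workflows/pre-commit.yml -- illustrative sketch; the workflow
# actually added by this commit may differ in name, versions, and caching.
name: pre-commit

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: "3.9.7"
      - name: Run pre-commit on all files
        run: |
          pip install pre-commit
          pre-commit run --all-files
```

The CACHE_VERSION variable mentioned in the commit message is the usual cache-busting trick: hook environments are typically cached (for example with actions/cache) keyed on the hash of .pre-commit-config.yaml plus a manually bumped version string, so a stale cache can be invalidated by changing one value.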