Unverified Commit 0e8c2a96 authored by Jun Ru Anderson, committed by GitHub

[test] set torch seed for Adam tests (#49)



Set the torch seed for the tests. Mark the mixed-precision and memory-efficient mixed-precision state_dict tests as xfail, since their optimizer state is cast to FP16 and back to FP32 during load_state_dict, which can lose precision.
Co-authored-by: Jun Ru Anderson <andersonic@fb.com>
parent c2d6f4b6
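For context on the xfail: an FP32 tensor cast to FP16 and back generally does not round-trip bit-exactly, so exact-equality checks on reloaded optimizer state are not expected to hold under mixed precision. A minimal standalone sketch using plain torch (illustration only, not part of this commit):

import torch

x = torch.randn(8, dtype=torch.float32)
roundtrip = x.half().float()            # FP32 -> FP16 -> FP32
print(torch.equal(x, roundtrip))        # typically False
print((x - roundtrip).abs().max())      # small but nonzero rounding error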
@@ -135,7 +135,11 @@ def make_model(device, ntokens):
     criterion = nn.CrossEntropyLoss()
     lr = 0.01  # learning rate
-    optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION)
+    try:
+        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION)
+    except NameError:
+        optimizer = Adam(p.parameters(), lr=lr)
     return p, criterion, optimizer
...
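The try/except above keeps the benchmark usable when the fairscale Adam (and its Precision enum) cannot be imported: if Precision is undefined, the first constructor call raises NameError and a plain optimizer without the precision argument is used instead. A minimal sketch of the same fallback pattern, assuming a toy nn.Linear model and torch.optim.Adam as the stand-in optimizer:

import torch.nn as nn
from torch.optim import Adam  # assumption: plain torch Adam stands in for the fairscale one

def make_optimizer(model: nn.Module, lr: float = 0.01):
    try:
        # Precision is intentionally undefined here, so this raises NameError,
        # mimicking the case where the fairscale import failed.
        return Adam(model.parameters(), lr=lr, precision=Precision.MIXED_PRECISION)  # noqa: F821
    except NameError:
        return Adam(model.parameters(), lr=lr)

optimizer = make_optimizer(nn.Linear(4, 2))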
@@ -147,6 +147,10 @@ try:
         def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
             super().load_state_dict(state_dict)
+            # TODO: Optimizer state gets cast to FP16 and back to FP32 for
+            # mixed-precision and memory-efficient mixed-precision. Eventually
+            # we want to fix this, as some precision may be lost
             for group in self.param_groups:
                 for p in group["params"]:
                     self.state[p]["exp_avg"] = self.state[p]["exp_avg"].type(self.optim_type)
...
@@ -20,6 +20,12 @@ skip_if_no_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda
 skip_if_no_adam = pytest.mark.skipif(not imported_adam, reason="Fairscale Adam not available")
+@pytest.fixture(autouse=True)
+def set_torch_seed():
+    torch.manual_seed(1)
+    yield
 def make_full_precision_params():
     weight = torch.randn(2, 1).cuda().requires_grad_()
     bias = torch.randn(2).cuda().requires_grad_()
@@ -75,12 +81,26 @@ def state_dict_test(optimizer, weight, bias, input):
     # Load state dict
     state_dict = deepcopy(optimizer.state_dict())
     optimizer_c.load_state_dict(state_dict)
+    for group, group_c in zip(optimizer.param_groups, optimizer_c.param_groups):
+        for p, p_c in zip(group["params"], group_c["params"]):
+            assert torch.equal(optimizer.state[p]["exp_avg"], optimizer_c.state[p_c]["exp_avg"])
+            assert torch.equal(optimizer.state[p]["exp_avg_sq"], optimizer_c.state[p_c]["exp_avg_sq"])
+    if optimizer.fp32_param_groups:
+        # When using mixed precision, fp32_param_groups are made from FP16 params rather than
+        # copied via state_dict, introducing differences between the original optimizer and
+        # the copy. Because this test requires that they be the exact same, we copy the
+        # fp32 params from the original optimizer to the copy
+        optimizer_c.fp32_param_groups = deepcopy(optimizer.fp32_param_groups)
     # Run both optimizations in parallel
     for _i in range(5):
         optimizer.step(fn)
         optimizer_c.step(fn_c)
-    (weight - weight_c).to("cpu").detach().apply_(assert_almost_zero)
-    (bias - bias_c).to("cpu").detach().apply_(assert_almost_zero)
+    assert torch.equal(weight, weight_c)
+    assert torch.equal(bias, bias_c)

 def assert_almost_zero(x):
@@ -230,7 +250,12 @@ def test_state_dict_full_precision():
 @skip_if_no_cuda
 @skip_if_no_adam
+@pytest.mark.xfail
 def test_state_dict_mixed_precision():
+    # TODO: Optimizer state gets cast to FP16 and back to FP32 for
+    # mixed-precision and memory-efficient mixed-precision, resulting
+    # in a potential loss of precision. Thus, as training proceeds, we don't
+    # necessarily expect the parameters to remain the exact same.
     weight, bias, input = make_half_precision_params()
     optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MIXED_PRECISION)
@@ -239,7 +264,12 @@ def test_state_dict_mixed_precision():
 @skip_if_no_cuda
 @skip_if_no_adam
+@pytest.mark.xfail
 def test_state_dict_memory_efficient():
+    # TODO: Optimizer state gets cast to FP16 and back to FP32 for
+    # mixed-precision and memory-efficient mixed-precision, resulting
+    # in a potential loss of precision. Thus, as training proceeds, we don't
+    # necessarily expect the parameters to remain the exact same.
     weight, bias, input = make_half_precision_params()
     optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION)
...
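As for the new autouse fixture: fixing the torch seed makes the randomly initialized test parameters identical across runs, so the optimizer trajectories being compared start from the same point. A CPU-only sketch of the effect, independent of the fairscale test suite:

import torch

def fresh_params():
    # mirrors the shape of make_full_precision_params, but on CPU and without CUDA
    torch.manual_seed(1)
    weight = torch.randn(2, 1, requires_grad=True)
    bias = torch.randn(2, requires_grad=True)
    return weight, bias

w1, b1 = fresh_params()
w2, b2 = fresh_params()
assert torch.equal(w1, w2) and torch.equal(b1, b2)  # same seed, same initialization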