Commit 93f3a3bc authored by Hubert Lu

Revert test_fused_optimizer.py to the upstream version to resolve multiple unit test errors

parent 203e3231
@@ -29,10 +29,7 @@ class TestFusedOptimizer(unittest.TestCase):
         ref_param = []
         tst_param = []
         for tensor in tensors:
-            if apex_only:
-                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
-            else:
-                ref_param.append(torch.nn.Parameter(tensor.clone()))
+            ref_param.append(torch.nn.Parameter(tensor.clone()))
             tst_param.append(torch.nn.Parameter(tensor.clone()))
 
         ref_optim = self.ref_optim(ref_param, **options)
@@ -40,10 +37,10 @@ class TestFusedOptimizer(unittest.TestCase):
 
         return (ref_param, tst_param, ref_optim, tst_optim)
 
-    def gen_grad(self, ref_param, tst_param, apex_only=False):
+    def gen_grad(self, ref_param, tst_param):
         for p_ref, p_tst in zip(ref_param, tst_param):
-            p_tst.grad = torch.rand_like(p_tst)
-            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad
+            p_ref.grad = torch.rand_like(p_ref)
+            p_tst.grad = p_ref.grad
 
     def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
         half_grads = []
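The restored upstream gen_grad seeds the reference and fused copies with the very same random gradient tensor, so any later divergence comes from the optimizers rather than from their inputs. A minimal standalone illustration of that seeding (toy CPU tensors, not taken from the test file):

```python
import torch

p_ref = torch.nn.Parameter(torch.rand(4))
p_tst = torch.nn.Parameter(p_ref.detach().clone())

# Draw one random gradient and share it between the reference and test copies.
p_ref.grad = torch.rand_like(p_ref)
p_tst.grad = p_ref.grad

assert torch.equal(p_ref.grad, p_tst.grad)
```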
@@ -52,11 +49,9 @@ class TestFusedOptimizer(unittest.TestCase):
             p_ref.grad = half_grads[-1].float() / scale
         return half_grads
 
-    def get_max_diff(self, ref_param, tst_param, apex_only=False):
+    def get_max_diff(self, ref_param, tst_param):
         max_abs_diff = max_rel_diff = 0
         for p_ref, p_tst in zip(ref_param, tst_param):
-            if apex_only:
-                p_tst = p_tst.float()
             max_abs_diff_p = (p_ref - p_tst).abs().max().item()
             max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
 
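The tolerance check kept above compares element-wise absolute and relative error and tracks the maximum of each. A small worked example with made-up values (not from the test file):

```python
import torch

p_ref = torch.tensor([1.0, 2.0, 4.0])
p_tst = torch.tensor([1.0, 2.001, 3.998])

max_abs_diff_p = (p_ref - p_tst).abs().max().item()            # ~0.002, from the last element
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()  # ~0.0005 (0.001/2 and 0.002/4 tie)
```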
@@ -65,7 +60,7 @@ class TestFusedOptimizer(unittest.TestCase):
 
         return max_abs_diff, max_rel_diff
 
-    def gen_single_type_test(self, param_type=torch.float, apex_only=False, device='cuda'):
+    def gen_single_type_test(self, param_type=torch.float, device='cuda'):
         nelem = 278011
 
         # Some ref and test optimizers may require different set of options.
@@ -82,13 +77,12 @@ class TestFusedOptimizer(unittest.TestCase):
             self.gen_param_optim([tensor], self.options, self.tst_options)
 
         for i in range(self.iters):
-            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
+            self.gen_grad(ref_param, tst_param)
             ref_optim.step()
             tst_optim.step()
-            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
             self.assertLessEqual(max_abs_diff, self.max_abs_diff)
-            if not apex_only:
-                self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
 
 
 class TestFusedAdam(TestFusedOptimizer):
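The loop restored in the hunk above drives each unit test: identical gradients are fed to a reference optimizer and to the fused apex optimizer, both are stepped, and the resulting parameters must agree within absolute and relative tolerances. A minimal standalone sketch of that pattern, assuming apex is installed and a CUDA device is available; the concrete optimizers, option values, and tolerances here are illustrative, not taken from the test file:

```python
import torch
from apex.optimizers import FusedAdam

options = {"lr": 5e-4, "betas": (0.9, 0.999), "eps": 1e-8}
tensor = torch.rand(278011, dtype=torch.float, device="cuda")

ref_param = [torch.nn.Parameter(tensor.clone())]
tst_param = [torch.nn.Parameter(tensor.clone())]
ref_optim = torch.optim.Adam(ref_param, **options)   # reference implementation
tst_optim = FusedAdam(tst_param, **options)          # fused implementation under test

for _ in range(7):
    # Seed both copies with identical random gradients, then step each optimizer.
    for p_ref, p_tst in zip(ref_param, tst_param):
        p_ref.grad = torch.rand_like(p_ref)
        p_tst.grad = p_ref.grad
    ref_optim.step()
    tst_optim.step()
    # Updated parameters should stay within tight absolute/relative tolerances.
    for p_ref, p_tst in zip(ref_param, tst_param):
        assert (p_ref - p_tst).abs().max().item() <= 1e-3
        assert ((p_ref - p_tst) / p_ref).abs().max().item() <= 1.0
```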
@@ -106,14 +100,6 @@ class TestFusedAdam(TestFusedOptimizer):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
-    # Compares bfloat16 computation against float32 as gold standard.
-    # Uses apex optimizers(controlled by apex_only flag) for both types.
-    # Doesn't use upstream optimizer like other tests as they seem to be
-    # numerically unstable for half types
-    def test_bfloat16(self):
-        self.max_abs_diff = 1e-2
-        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
-
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
...
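For context, the removed test_bfloat16 compared bfloat16 computation against a float32 "gold standard", using the apex fused optimizer for both copies and skipping the relative-error bound. A sketch of that behavior, assuming apex's FusedAdam accepts bfloat16 parameters (as the removed test relied on) and a CUDA device is present; the option values are illustrative:

```python
import torch
from apex.optimizers import FusedAdam

tensor = torch.rand(278011, dtype=torch.bfloat16, device="cuda")
ref_param = [torch.nn.Parameter(tensor.clone().float())]  # float32 gold-standard copy
tst_param = [torch.nn.Parameter(tensor.clone())]           # bfloat16 copy under test
ref_optim = FusedAdam(ref_param, lr=5e-4)
tst_optim = FusedAdam(tst_param, lr=5e-4)

for _ in range(7):
    for p_ref, p_tst in zip(ref_param, tst_param):
        p_tst.grad = torch.rand_like(p_tst)                 # bf16 gradient for the test copy
        p_ref.grad = p_tst.grad.detach().float()            # same values, upcast for the reference
    ref_optim.step()
    tst_optim.step()
    for p_ref, p_tst in zip(ref_param, tst_param):
        # Only the absolute difference is bounded (1e-2 in the removed test);
        # the relative-error check is skipped for bfloat16.
        assert (p_ref - p_tst.float()).abs().max().item() <= 1e-2
```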