# test_adagrad.py
import unittest

import apex
import torch
from apex.testing.common_utils import skipIfRocm

class TestFusedAdagrad(unittest.TestCase):
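    """Compares apex.optimizers.FusedAdagrad against torch.optim.Adagrad."""
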
    def setUp(self, max_abs_diff=1e-6, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
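        """Clone ``tensors`` into reference and test parameter lists and build
        the reference optimizer (torch.optim.Adagrad, or FusedAdagrad on float32
        copies when ``apex_only`` is set) alongside the FusedAdagrad under test."""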
        ref_param = []
        tst_param = []
        for tensor in tensors:
            if apex_only:
                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
            else:
                ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        if apex_only:
            ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
        else:
            ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
        tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param, apex_only=False):
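        """Assign identical random gradients to the test and reference parameters
        (cast to float32 on the reference side when ``apex_only`` is set)."""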
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_tst.grad = torch.rand_like(p_tst)
            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
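        """Generate fp16 gradients for mixed-precision runs; the reference
        parameters receive the float32 copy of each gradient divided by ``scale``."""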
        half_grads = []
        for p_ref, _ in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param, apex_only=False):
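        """Return the largest element-wise absolute and relative differences
        between the reference and test parameters."""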
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            if apex_only:
                p_tst = p_tst.float()
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
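        """Run ``self.iters`` optimizer steps on a single large tensor of
        ``param_type`` and check that the reference and test parameters agree
        within the configured tolerances."""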
        nelem = 278011
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}

        tensor = torch.rand(nelem, dtype=param_type, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option, apex_only=apex_only
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(
                ref_param, tst_param, apex_only=apex_only
            )

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            if not apex_only:
                self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @skipIfRocm
    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    # Compares bfloat16 computation against float32 as the gold standard.
    # Uses apex optimizers (controlled by the apex_only flag) for both types.
    # Doesn't use the upstream optimizer like the other tests, as it appears to be
    # numerically unstable for half types (see the skip note on the test above).
    @skipIfRocm
    def test_bfloat16(self):
        self.max_abs_diff = 1e-2
        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)

    @skipIfRocm
    def test_multi_params(self):
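        """FusedAdagrad should match torch.optim.Adagrad across parameter
        tensors of several different shapes."""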
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adagrad_option(self):
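        """FusedAdagrad should match torch.optim.Adagrad with non-default
        lr and eps settings."""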
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
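

# Allows running this test module directly, e.g. `python test_adagrad.py`.
if __name__ == "__main__":
    unittest.main()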