# BSD 3-Clause License
#
# Copyright (C) 2021 THL A29 Limited, a Tencent company.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
#  * Neither the name of the psutil authors nor the names of its contributors
#    may be used to endorse or promote products derived from this software without
#    specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

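"""Unit test for the cpu_adam extension.

Runs the cpu_adam kernel and a pure-PyTorch reference Adam/AdamW update on
identical inputs, then checks that parameters, gradients, and both moment
estimates agree within tolerance. Run directly or via pytest.
"""
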
import math

import torch

try:
    import cpu_adam
except ImportError as err:
    raise ImportError("failed to import the cpu_adam extension; build it before running this test") from err


def torch_adam_update(
    step,
    lr,
    beta1,
    beta2,
    eps,
    weight_decay,
    param,
    grad,
    exp_avg,
    exp_avg_sq,
    loss_scale,
    use_adamw,
):
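    """Pure-PyTorch Adam/AdamW reference update, used as ground truth for cpu_adam."""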
    if loss_scale > 0:
        grad.div_(loss_scale)
    bias_correction1 = 1 - beta1**step
    bias_correction2 = 1 - beta2**step

    if weight_decay != 0:
        if use_adamw:
            # Perform stepweight decay
            param.mul_(1 - lr * weight_decay)
        else:
            grad = grad.add(param, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
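    # denom = sqrt(v_hat) + eps, where v_hat = exp_avg_sq / bias_correction2;
    # the first-moment bias correction is folded into step_size below.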
    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

    step_size = lr / bias_correction1

    param.addcdiv_(exp_avg, denom, value=-step_size)


class Test:
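    """Minimal stand-in for unittest.TestCase, providing only the assert helpers used below."""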

    def __init__(self):
        self.opt_id = 0

    def assertLess(self, data_diff, threshold, msg):
        assert data_diff < threshold, msg

    def assertTrue(self, condition, msg):
        assert condition, msg

    def check_res(
        self,
        step,
        lr,
        eps,
        beta1,
        beta2,
        weight_decay,
        shape,
        grad_dtype,
        loss_scale,
        use_adamw,
        cpu_adam_op,
    ):
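        """Run one optimizer step with both cpu_adam and the PyTorch reference on
        identical inputs, then assert that param, grad, and both moments agree."""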
        p_data = torch.rand(shape, dtype=grad_dtype)
        p_data_copy = p_data.clone().float()
        p_grad = torch.rand(shape, dtype=grad_dtype)
        if loss_scale > 0:
            p_grad.mul_(loss_scale)
        p_grad_copy = p_grad.clone().float()
        exp_avg = torch.rand(shape)
        exp_avg_copy = exp_avg.clone()
        exp_avg_sq = torch.rand(shape)
        exp_avg_sq_copy = exp_avg_sq.clone()

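        # Create the native optimizer instance under self.opt_id (the trailing
        # flag presumably toggles logging in the extension).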
        cpu_adam_op.create_adam(self.opt_id, lr, beta1, beta2, eps, weight_decay, use_adamw, True)
        cpu_adam_op.adam_update(
            self.opt_id,
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            True,
            p_data.view(-1),    # param data (grad_dtype: fp16 or fp32)
            p_grad.view(-1),    # grad (grad_dtype: fp16 or fp32)
            exp_avg.view(-1),
            exp_avg_sq.view(-1),
            loss_scale,
        )

        torch_adam_update(
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            p_data_copy,    # fp32 reference copy of the param data
            p_grad_copy,    # fp32 reference copy of the grad
            exp_avg_copy,
            exp_avg_sq_copy,
            loss_scale,
            use_adamw,
        )

        if loss_scale > 0:
            p_grad.div_(loss_scale)

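        # Compare the kernel's outputs against the fp32 reference copies.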
        var = p_data_copy - p_data
        data_diff = torch.max(torch.abs(var))
        threshold = 2e-3 if grad_dtype == torch.half else 1e-4
        self.assertLess(
            data_diff,
            threshold,
            f"p_data diff {data_diff}. failed check, step {step}, lr {lr} eps "
            f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} loss_scale {loss_scale} grad_dtype {grad_dtype}",
        )
        max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad))
        self.assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}")
        max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg))
        self.assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}")
        max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq))
        self.assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}")

    def test_cpu_adam(self):
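        """Sweep the hyperparameter grid over Adam and AdamW with fp16 and fp32 grads."""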
        for use_adamw in [False, True]:
            for shape in [(23,), (8, 24)]:
                for step in range(1, 2):
                    for lr in [0.01]:
                        for eps in [1e-8]:
                            for beta1 in [0.9]:
                                for beta2 in [0.999]:
                                    for weight_decay in [0.001]:
                                        for grad_dtype in [torch.half, torch.float]:
                                            for loss_scale in [-1, 2**5]:  # -1 disables loss scaling
                                                self.check_res(
                                                    step,
                                                    lr,
                                                    eps,
                                                    beta1,
                                                    beta2,
                                                    weight_decay,
                                                    shape,
                                                    grad_dtype,
                                                    loss_scale,
                                                    use_adamw,
                                                    cpu_adam,
                                                )


def test_cpu_adam():
    test_case = Test()
    test_case.test_cpu_adam()


if __name__ == "__main__":
    test = Test()
    test.test_cpu_adam()