import math

import torch

from colossalai.testing import parameterize


def torch_adam_update(
    step,
    lr,
    beta1,
    beta2,
    eps,
    weight_decay,
    param,
    grad,
    exp_avg,
    exp_avg_sq,
    use_adamw,
):
    """Apply one reference Adam / AdamW step using plain torch operations.

    ``param``, ``exp_avg`` and ``exp_avg_sq`` are updated in place; ``grad``
    is never modified. Nothing is returned.

    Args:
        step: Index of the current optimizer step, used for bias correction.
            Must be >= 1, otherwise both bias corrections are zero.
        lr: Learning rate.
        beta1: Decay rate of the first-moment running average.
        beta2: Decay rate of the second-moment running average.
        eps: Term added to the denominator for numerical stability.
        weight_decay: Weight-decay coefficient; ``0`` disables decay.
        param: Parameter tensor, updated in place.
        grad: Gradient tensor (read only).
        exp_avg: First-moment running average, updated in place.
        exp_avg_sq: Second-moment running average, updated in place.
        use_adamw: If true, apply decoupled (AdamW) weight decay to ``param``;
            otherwise add ``weight_decay * param`` to the gradient.
    """
    bias_correction1 = 1 - beta1**step
    bias_correction2 = 1 - beta2**step

    if weight_decay != 0:
        if use_adamw:
            # Decoupled weight decay: shrink the parameter directly.
            param.mul_(1 - lr * weight_decay)
        else:
            # L2-style decay: out-of-place add, so the caller's grad tensor
            # is left untouched.
            grad = grad.add(param, alpha=weight_decay)

    # Decay the first and second moment running average coefficient
    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)

    step_size = lr / bias_correction1

    param.addcdiv_(exp_avg, denom, value=-step_size)


# Thin assertion helpers used by test_cpu_adam.
def assertLess(data_diff, threshold, msg):
    """Assert that ``data_diff`` is strictly below ``threshold``.

    ``msg`` becomes the assertion message when the check fails.
    """
    is_below = data_diff < threshold
    assert is_below, msg


def assertTrue(condition, msg):
    """Assert that ``condition`` is truthy, failing with ``msg`` otherwise."""
    outcome = condition
    assert outcome, msg


@parameterize('adamw', [True, False])
@parameterize('step', [1, 2])
@parameterize('p_dtype', [torch.float, torch.half])
@parameterize('g_dtype', [torch.float, torch.half])
def test_cpu_adam(adamw, step, p_dtype, g_dtype):
    """Compare the CPUAdamOptimizer kernel against ``torch_adam_update``.

    For 1024 random 64-element inputs, one optimizer step is run through the
    CPU Adam kernel and through the pure-torch reference on fp32 copies of the
    same data. The parameter, gradient and both moment buffers must then agree
    to within ``1e-3`` (max absolute difference).

    Note: ``weight_decay`` is fixed at 0 here, so the weight-decay branches of
    the reference (AdamW vs. L2) are not exercised by this test.

    Args:
        adamw: Passed to the kernel constructor and to the reference update.
        step: Optimizer step index passed to both implementations.
        p_dtype: dtype of the parameter tensor handed to the kernel.
        g_dtype: dtype of the gradient tensor handed to the kernel.
    """
    lr = 1e-3
    beta1, beta2 = 0.9, 0.999
    eps = 1e-8
    weight_decay = 0
    threshold = 1e-3

    # Resolving the kernel class does not depend on the per-iteration data, so
    # do it once instead of re-importing / re-building on every iteration.
    # Only import-style failures trigger the runtime-build fallback; a bare
    # ``except`` would also swallow KeyboardInterrupt and SystemExit.
    try:
        import colossalai._C.cpu_optim
        optimizer_cls = colossalai._C.cpu_optim.CPUAdamOptimizer
        print("use prebuilt CPUAdamOptimizer")
    except (ImportError, AttributeError):
        from colossalai.kernel.op_builder.cpu_adam import CPUAdamBuilder
        optimizer_cls = CPUAdamBuilder().load().CPUAdamOptimizer
        print("build CPUAdamOptimizer at runtime")

    for _ in range(1024):
        # Kernel inputs use the parameterized dtypes; the reference always
        # works on fp32 copies of the same values.
        p_data = torch.rand(64, dtype=p_dtype)
        p_data_copy = p_data.clone().float()
        p_grad = torch.rand(64, dtype=g_dtype)
        p_grad_copy = p_grad.clone().float()
        exp_avg = torch.rand(p_data.shape)
        exp_avg_copy = exp_avg.clone()
        exp_avg_sq = torch.rand(p_data.shape)
        exp_avg_sq_copy = exp_avg_sq.clone()

        # A fresh optimizer per iteration, as in the original loop, so no
        # internal kernel state carries over between samples.
        cpu_adam_op = optimizer_cls(lr, beta1, beta2, eps, weight_decay, adamw)

        cpu_adam_op.step(
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            True,    # presumably the bias-correction flag -- TODO confirm against the kernel signature
            p_data.view(-1),    # param in p_dtype (fp32 or fp16)
            p_grad.view(-1),    # grad in g_dtype (fp32 or fp16)
            exp_avg.view(-1),
            exp_avg_sq.view(-1),
            -1,    # presumably the loss scale (-1 = disabled) -- TODO confirm
        )

        torch_adam_update(
            step,
            lr,
            beta1,
            beta2,
            eps,
            weight_decay,
            p_data_copy,    # fp32 data
            p_grad_copy,    # fp32 grad
            exp_avg_copy,
            exp_avg_sq_copy,
            adamw,
        )

        var = p_data_copy - p_data
        data_diff = torch.max(torch.abs(var))
        assertLess(
            data_diff,
            threshold,
            f"p_data diff {data_diff}. failed check, step {step}, lr {lr}, eps "
            f"{eps} beta1 {beta1} beta2 {beta2} weight_decay {weight_decay} p_dtype {p_dtype}, g_dtype {g_dtype}",
        )
        max_grad_diff = torch.max(torch.abs(p_grad_copy - p_grad))
        assertTrue(max_grad_diff < threshold, f"diff {max_grad_diff}")
        max_exp_avg_diff = torch.max(torch.abs(exp_avg_copy - exp_avg))
        assertTrue(max_exp_avg_diff < threshold, f"max_exp_avg_diff {max_exp_avg_diff}")
        max_exp_avg_sq_diff = torch.max(torch.abs(exp_avg_sq_copy - exp_avg_sq))
        assertTrue(max_exp_avg_sq_diff < threshold, f"max_exp_avg_sq_diff {max_exp_avg_sq_diff}")


if __name__ == '__main__':
    # Called with no arguments: the stacked ``parameterize`` decorators
    # presumably supply every argument combination -- verify against
    # colossalai.testing.parameterize.
    test_cpu_adam()