# test_faster_shadow.py
#
# Compares the FasterMoE scheduled global forward pass against the naive
# global forward pass on several distributed workers.
import pytest

import os
import sys
import json
import math

import torch
import torch.distributed as dist
import torch.nn.functional as F
from fmoe.functions import ensure_comm
from test_ddp import _ensure_initialized, _run_distributed
from test_numerical import _assert_numerical
from fmoe.fastermoe.schedule import _fmoe_general_global_forward as smart_fwd
from fmoe.layers import _fmoe_general_global_forward as naive_fwd


@pytest.mark.parametrize("n_process", [8])
@pytest.mark.parametrize("d_model", [1024])
@pytest.mark.parametrize("batch_size", [16, 512])
@pytest.mark.parametrize("n_expert", [1])
@pytest.mark.parametrize("group_sz", [1, 2, 4])
@pytest.mark.parametrize("pass_stored", [True, False])
def test_faster_shadow(n_process, d_model, batch_size, n_expert, group_sz, pass_stored):
    """Run ``_test_faster_shadow`` through ``_run_distributed``.

    Args:
        n_process: Number of processes handed to ``_run_distributed``.
        d_model: Forwarded to the worker as ``d_model``.
        batch_size: Forwarded to the worker as ``batch_size``.
        n_expert: Forwarded to the worker as ``n_expert``.
        group_sz: Exported as ``FMOE_FASTER_GROUP_SIZE``; it is not a worker
            argument.
        pass_stored: Forwarded to the worker as ``pass_stored``.
    """
    # Keyword arguments of the worker function.  Presumably they are
    # serialised and handed to this script's ``__main__`` entry point --
    # confirm against ``_run_distributed`` in test_ddp.
    worker_kwargs = {
        'd_model': d_model,
        'batch_size': batch_size,
        'n_expert': n_expert,
        'pass_stored': pass_stored,
    }
    # Environment values must be strings, hence the explicit str().
    worker_env = dict(
        FMOE_FASTER_GROUP_SIZE=str(group_sz),
        FMOE_FASTER_SHADOW_ENABLE='ON',
    )
    _run_distributed(
        '_test_faster_shadow',
        n_process,
        worker_kwargs,
        script=__file__,
        env=worker_env,
    )


def _test_faster_shadow(d_model, batch_size, n_expert, pass_stored):
    """Check that ``smart_fwd`` and ``naive_fwd`` agree numerically.

    The same random input, the same expert indices and two ``Linear``
    experts holding identical parameters are pushed through both forward
    implementations.  The outputs and the gradients with respect to the
    input, the bias and the weight are then compared with
    ``_assert_numerical``.

    Args:
        d_model: Width of each input row; the expert is a square
            ``Linear(d_model, d_model)``.
        batch_size: Number of input rows created on this worker.
        n_expert: Experts per worker; expert ids are drawn from
            ``[0, world_size * n_expert)``.
        pass_stored: When true, a random boolean ``stored_models`` mask
            (broadcast from rank 0) is passed to ``smart_fwd``; otherwise
            the argument is omitted.
    """
    _ensure_initialized()
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    tot_expert = world_size * n_expert

    # Two independent autograd leaves holding the same values, one per
    # implementation, so that their gradients can be compared afterwards.
    x1 = torch.rand(batch_size, d_model).cuda()
    x1.requires_grad = True
    x2 = x1.detach().clone()
    x2.requires_grad = True

    # Every row selects two expert ids.
    topk_idx = torch.randint(0, tot_expert, (batch_size, 2)).cuda()

    # m2 starts as an exact copy of m1.
    m1 = torch.nn.Linear(d_model, d_model).cuda()
    m2 = torch.nn.Linear(d_model, d_model).cuda()
    with torch.no_grad():
        m2.weight.copy_(m1.weight)
        m2.bias.copy_(m1.bias)

    # The two forward implementations call the expert function with
    # different arities, hence two thin wrappers.
    def ef1(x, fec, eidx):
        return m1(x)

    def ef2(x, fec):
        return m2(x)

    smart_kwargs = {'experts': [m1]}
    if pass_stored:
        stored_models = torch.randint(0, 2, (tot_expert,)).bool().cuda()
        # Forcing the last entry on guarantees a non-empty mask, so no
        # retry loop is needed.
        stored_models[-1] = True
        # Every rank must use the same mask: take rank 0's.
        dist.broadcast(stored_models, 0)
        stored_models = stored_models.cpu()
        print(stored_models)
        smart_kwargs['stored_models'] = stored_models

    ensure_comm(x1, None)
    y1 = smart_fwd(x1, topk_idx, ef1, n_expert, world_size, **smart_kwargs)
    y1.sum().backward()

    y2 = naive_fwd(x2, topk_idx, ef2, n_expert, world_size, experts=[m2])
    y2.sum().backward()

    _assert_numerical(
        ['out', 'grad_in', 'grad_bias', 'grad_weight'],
        [y1, x1.grad, m1.bias.grad, m1.weight.grad],
        [y2, x2.grad, m2.bias.grad, m2.weight.grad],
        rank,
    )


if __name__ == '__main__':
    # Command-line entry point: argv[1] names a function defined in this
    # module and argv[2] is a JSON object with its keyword arguments.
    if len(sys.argv) >= 3:
        args = json.loads(sys.argv[2])
        globals()[sys.argv[1]](**args)
    else:
        # No arguments given: run one fixed configuration directly
        # (d_model=1024, batch_size=16, n_expert=1, pass_stored=True).
        _test_faster_shadow(1024, 16, 1, True)