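"""Distributed test of ShardedOptimizerV2 on a ZeRO-sharded MoE model.

Two copies of the same MoE model are trained side by side: one wrapped in
ShardedModelV2/ShardedOptimizerV2 and one under apex AMP (O2). After every
step their parameters are compared and checked for inf/NaN values.
"""
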
from functools import partial

import pytest
import torch
import torch.multiprocessing as mp

import colossalai
from colossalai.amp import convert_to_apex_amp
from colossalai.context import MOE_CONTEXT
from colossalai.engine.gradient_handler import MoeGradientHandler
from colossalai.nn import MoeLoss
from colossalai.nn.optimizer import CPUAdam
from colossalai.testing import assert_equal_in_group, parameterize, rerun_if_address_is_in_use
from colossalai.utils import free_port, get_current_device
from colossalai.zero.init_ctx import ZeroInitContext
from colossalai.zero.shard_utils import BucketTensorShardStrategy, TensorShardStrategy
from colossalai.zero.sharded_model import ShardedModelV2
from colossalai.zero.sharded_model.utils import col_model_deepcopy
from colossalai.zero.sharded_optim import ShardedOptimizerV2
from colossalai.zero.sharded_optim._utils import has_inf_or_nan
from tests.components_to_test.registry import non_distributed_component_funcs
from tests.test_moe.test_moe_zero_init import MoeModel
from tests.test_zero.common import CONFIG, check_sharded_model_params


def _run_step(model, optimizer, data, label, criterion, grad_handler):
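    """Run a single training step: forward, backward, gradient sync, optimizer step.

    If ``criterion`` is given, the model returns logits and the loss is computed
    here; otherwise the model is expected to compute the loss itself.
    ``ShardedModelV2`` requires ``optimizer.backward(loss)`` instead of
    ``loss.backward()``.
    """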
    model.train()
    optimizer.zero_grad()

    if criterion:
        y = model(data)
        loss = criterion(y, label)
    else:
        loss = model(data, label)

    loss = loss.float()
    if isinstance(model, ShardedModelV2):
        optimizer.backward(loss)
    else:
        loss.backward()

    if grad_handler is not None:
        grad_handler.handle_gradient()

    optimizer.step()


@parameterize("cpu_offload", [True])
@parameterize("use_cpuadam", [True])    # We do not use Hybrid Adam right now, since it has a little bug
@parameterize("reuse_fp16_shard", [True, False])
@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def _run_test_sharded_optim_v2(cpu_offload,
                               shard_strategy_class,
                               use_cpuadam,
                               reuse_fp16_shard,
                               gpu_margin_mem_ratio=0.0):
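    """Compare ShardedOptimizerV2 on a ZeRO-sharded MoE model against an apex
    AMP (O2) baseline trained with the regular optimizer.

    Both models start from identical weights; after every step the sharded
    parameters are checked against the baseline and scanned for inf/NaN.
    """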
    if use_cpuadam and not cpu_offload:
        # this test exercises CPUAdam only together with CPU offload
        return
    shard_strategy = shard_strategy_class()
    MOE_CONTEXT.reset_loss()
    get_components_func = non_distributed_component_funcs.get_callable('hanging_param_model')
    _, train_dataloader, _, optimizer_class, _ = get_components_func()
    criterion = MoeLoss(aux_weight=0.01, loss_fn=torch.nn.CrossEntropyLoss)

    with ZeroInitContext(target_device=torch.device('cpu') if cpu_offload else get_current_device(),
                         shard_strategy=shard_strategy,
                         shard_param=True):
        zero_model = MoeModel(checkpoint=True)

    zero_model = ShardedModelV2(zero_model,
                                shard_strategy,
                                tensor_placement_policy='cpu' if cpu_offload else 'cuda',
                                reuse_fp16_shard=reuse_fp16_shard)

    # check that replicated (non-sharded) parameters are identical across the data-parallel group
    for name, p in zero_model.named_parameters():
        if not p.colo_attr.param_is_sharded and p.colo_attr.is_replicated:
            assert_equal_in_group(p.colo_attr.data_payload.to(get_current_device()))

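    # Build the baseline: a fresh fp16 model that receives the sharded weights
    # from zero_model, then is moved to CUDA and cast back to fp32.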
    model = MoeModel(checkpoint=True).half()
    col_model_deepcopy(zero_model, model)
    model = model.cuda().float()

    if use_cpuadam:
        optimizer_class = CPUAdam
    optim = optimizer_class(model.parameters(), lr=1e-3)
    sharded_optim = optimizer_class(zero_model.parameters(), lr=1e-3)
    sharded_optim = ShardedOptimizerV2(zero_model,
                                       sharded_optim,
                                       initial_scale=2**5,
                                       gpu_margin_mem_ratio=gpu_margin_mem_ratio)

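    # The apex AMP baseline relies on MoeGradientHandler to synchronize gradients
    # across processes; the sharded run below passes grad_handler=None instead.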
    amp_config = dict(opt_level='O2', keep_batchnorm_fp32=False)
    apex_model, apex_optimizer = convert_to_apex_amp(model, optim, amp_config)
    apex_grad_handler = MoeGradientHandler(model)

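    # Train both models for a handful of steps, comparing parameters after
    # each step and checking the baseline for inf/NaN values.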
    for i, (data, label) in enumerate(train_dataloader):
        if i > 5:
            break
        data, label = data.cuda(), label.cuda()
        _run_step(apex_model, apex_optimizer, data, label, criterion, apex_grad_handler)
        _run_step(zero_model, sharded_optim, data, label, criterion, None)
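        # NOTE: the checker receives reuse_fp16_shard=use_cpuadam, since with
        # CPUAdam offload the fp16 shard is effectively reused during the update
        # (see check_sharded_model_params in tests/test_zero/common.py).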
        check_sharded_model_params(model, zero_model, loose=True, reuse_fp16_shard=use_cpuadam)
        for param in model.parameters():
            assert not has_inf_or_nan(param)


def _run_dist(rank, world_size, port):
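    """Per-process entry point: initialize ColossalAI, set up the MoE context
    with a fixed seed, then run the parameterized test body."""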
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    MOE_CONTEXT.setup(seed=42)
    _run_test_sharded_optim_v2()


# NOTE: use_cpuadam=True with cpu_offload=False is skipped by the guard in _run_test_sharded_optim_v2 above
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [2])
@rerun_if_address_is_in_use()
def test_moe_zero_optim(world_size):
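    """Spawn one process per rank on a free port and run the distributed test."""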
    run_func = partial(_run_dist, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_moe_zero_optim(world_size=4)