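"""Tests for colossalai.gemini.update.ChunkV2: chunk construction, sharding,
tensor state transitions, and reduction."""
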
import torch
import colossalai
import pytest
import torch.multiprocessing as mp
import torch.distributed as dist
from functools import partial
from colossalai.testing import rerun_if_address_is_in_use, parameterize
from colossalai.utils import free_port, get_current_device
from colossalai.tensor import ProcessGroup as ColoProcessGroup
from colossalai.tensor import ColoParameter
from colossalai.gemini import TensorState
from colossalai.gemini.update import ChunkV2


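# Sum a Python scalar across all ranks (dist.all_reduce defaults to SUM).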
def dist_sum(x):
    temp = torch.tensor([x], device=get_current_device())
    dist.all_reduce(temp)
    return temp.item()


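# Build a random ColoParameter and stash an untouched clone for later checks.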
def add_param(param_list, param_cp_list, *args, **kwargs):
    param = ColoParameter(torch.randn(*args, **kwargs))
    param_list.append(param)
    param_cp_list.append(param.clone())


# Compare a chunk-managed parameter against its saved copy, moving the data
# onto the copy's device first when the two differ.
def check_equal(param, param_cp):
    if param.device != param_cp.device:
        temp = param.data.to(param_cp.device)
    else:
        temp = param.data
    return torch.equal(temp, param_cp.data)


@parameterize('init_device', [None, torch.device('cpu')])
@parameterize('keep_gathered', [True, False])
@parameterize('pin_memory', [True, False])
def exam_chunk_basic(init_device, keep_gathered, pin_memory):
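    # Walk one ChunkV2 through its life cycle: append parameters, close the
    # chunk, move and access it, drive tensor states, and reduce.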
    world_size = torch.distributed.get_world_size()
    pg = ColoProcessGroup()
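    # A chunk with a fixed budget of 1024 fp32 elements, shared by every rank.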
    my_chunk = ChunkV2(
        chunk_size=1024,
        process_group=pg,
        dtype=torch.float32,
        init_device=init_device,
        keep_gathered=keep_gathered,
        pin_memory=pin_memory
    )

    param_list = []
    param_cp_list = []

    add_param(param_list, param_cp_list, 8, 8, 8, device='cuda')
    add_param(param_list, param_cp_list, 4, 4)
    add_param(param_list, param_cp_list, 4, 8, 2, device='cuda')
    add_param(param_list, param_cp_list, 1, 1, 5)
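    # The four parameters occupy 8*8*8 + 4*4 + 4*8*2 + 1*1*5 = 597 elements.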

    for param in param_list:
        my_chunk.append_tensor(param)
    assert my_chunk.utilized_size == 597
    for param, param_cp in zip(param_list, param_cp_list):
        assert check_equal(param, param_cp)
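    # Closing the chunk freezes its layout; unless keep_gathered is set, each
    # rank then keeps only its own 1024 // world_size shard.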
    my_chunk.close_chunk()

    if keep_gathered is False:
        # Sharded: each rank keeps a 1024 // world_size slice, staged on CPU.
        assert my_chunk.cpu_shard.size(0) == 1024 // world_size
        assert my_chunk.device_type == 'cpu'
        assert my_chunk.can_move
        my_chunk.shard_move(get_current_device())
    else:
        # Gathered: every rank holds the full 1024-element payload on CUDA.
        assert my_chunk.chunk_total.size(0) == 1024
        assert my_chunk.device_type == 'cuda'
        assert not my_chunk.can_move

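    # Each rank sees only its shard's valid region; summed across ranks it
    # must cover the whole utilized payload.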
    assert dist_sum(my_chunk.valid_end) == my_chunk.utilized_size
    flag = my_chunk.has_inf_or_nan
    assert not flag, "has_inf_or_nan is {}".format(flag)

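    # Gather the whole chunk onto the current CUDA device and re-check values.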
    my_chunk.access_chunk()
    assert my_chunk.device_type == 'cuda'
    for param, param_cp in zip(param_list, param_cp_list):
        assert check_equal(param, param_cp)

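    # Tensor state machine: HOLD -> COMPUTE -> READY_FOR_REDUCE -> (reduce) -> HOLD.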
    assert my_chunk.tensors_state_monitor[TensorState.HOLD] == 4
    my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)
    assert my_chunk.tensors_state_monitor[TensorState.HOLD] == 3
    assert my_chunk.tensors_state_monitor[TensorState.COMPUTE] == 1
    assert not my_chunk.can_release

    for param in param_list:
        my_chunk.tensor_trans_state(param, TensorState.COMPUTE)
        my_chunk.tensor_trans_state(param, TensorState.READY_FOR_REDUCE)

    assert my_chunk.tensors_state_monitor[TensorState.READY_FOR_REDUCE] == 4
    assert my_chunk.can_reduce
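    # reduce() synchronizes the chunk across ranks and returns every tensor
    # to the HOLD state.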
    my_chunk.reduce()
    assert my_chunk.tensors_state_monitor[TensorState.HOLD] == 4

    if keep_gathered is False:
        assert my_chunk.cuda_shard.size(0) == 1024 // world_size
        assert my_chunk.device_type == 'cuda'
        assert my_chunk.can_move
    else:
        assert my_chunk.chunk_total.size(0) == 1024
        assert my_chunk.device_type == 'cuda'
        assert not my_chunk.can_move


def run_dist(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    exam_chunk_basic()


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2, 4])
@rerun_if_address_is_in_use()
def test_chunk_function(world_size):
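    # Launch one process per rank on a free port; each runs the whole test.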
    run_func = partial(run_dist, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_chunk_function(4)