megatron.py 4.01 KB
Newer Older
1
r'''
The adaptor to seamlessly enable FastMoE in Megatron-LM v2.0 with at most two
lines of modification.
See `examples/megatron` for usage instructions.
'''
Rick Ho's avatar
Rick Ho committed
6
from .transformer import FMoETransformerMLP
7
from .distributed import DistributedGroupedDataParallel
8
from .utils import get_torch_default_comm
Rick Ho's avatar
Rick Ho committed
9

Rick Ho's avatar
Rick Ho committed
10

Rick Ho's avatar
Rick Ho committed
11
class MegatronMLP(FMoETransformerMLP):
    r'''
    An FMoETransformerMLP layer configured from Megatron's argument object, so
    that it can stand in for the original MLP layer of a Megatron transformer
    block. Experts are distributed across the communication group `group`.
    '''
    def __init__(self, args, group):
        r'''
        Build the MoE layer from Megatron's `args`.
        * `args` must provide `seq_length`, `micro_batch_size`,
        `tensor_model_parallel_size`, `distributed_experts`, `num_experts`,
        `top_k`, `hidden_size` and `hidden_hidden_size`, plus `world_size`
        when `distributed_experts` is truthy.
        * `group` is forwarded to the parent class as `mp_group`.
        '''
        assert (
            args.seq_length * args.micro_batch_size
        ) % args.tensor_model_parallel_size == 0, \
            "Batch size x sequence length should be multiple of mp size"

        # Non-distributed experts are replicated on every worker, so the layer
        # is told that it lives in a world of size one.
        world_size = args.world_size if args.distributed_experts else 1

        super().__init__(
            args.num_experts,
            top_k=args.top_k,
            d_model=args.hidden_size,
            d_hidden=args.hidden_hidden_size,
            world_size=world_size,
            mp_group=group,
            expert_dp_comm='none' if args.distributed_experts else 'dp',
        )

    def forward(self, inp):
        r'''
        Run the MoE layer on `inp` and return `(output, bias)`, where `bias` is
        an all-zero tensor without gradient whose length equals the last
        dimension of `output`.
        NOTE(review): the extra bias is presumably there to mimic the return
        signature of Megatron's own MLP layer - confirm against the caller.
        '''
        moe_out = super().forward(inp)
        zero_bias = moe_out.new_zeros(moe_out.size(-1), requires_grad=False)
        return moe_out, zero_bias
Rick Ho's avatar
fmoefy  
Rick Ho committed
33
34


35
def fmoefy(model, num_experts=None, distributed_experts=True,
        hidden_hidden_size=None, top_k=None):
    r'''
    Replace MLP layers in a transformer-based model in Megatron by MoE.
    * `model` should be a standard Megatron model that has
    `model.language_model.transformer.layers` as transformer layers, which is an
    array of transformer blocks that contain an `mlp` member.
    * `num_experts`, when not None, overrides `args.num_experts`. It has to be
    specified either here or in Megatron's arguments.
    * `distributed_experts` is set to True if different experts are located in
    different workers. Otherwise, the experts on the workers are identical, and
    they are trained in data-parallel mode. This can be useful when testing on
    small models that do not require high training throughput or large parameter
    capacity. Pass None to keep the value already stored in Megatron's
    arguments; if there is none, True is used.
    * `hidden_hidden_size`, when not None, overrides `args.hidden_hidden_size`.
    If it is given neither here nor in the arguments, `4 * args.hidden_size` is
    used.
    * `top_k`, when not None, overrides `args.top_k`. If it is given neither
    here nor in the arguments, 2 is used.

    The resolved settings are written back into Megatron's argument object, the
    `mlp` member of every transformer layer is replaced by a `MegatronMLP`, and
    the same `model` object is returned.

    Note that pipeline parallel is not supported yet. When distributed experts
    are enabled, their communicator should be Megatron's
    tensor_model_parallel_comm x data_parallel_comm, which is not created, so
    the communicator returned by `get_torch_default_comm()` is used instead.
    '''
    # Imported lazily so that this module can be imported without Megatron.
    from megatron import get_args
    args = get_args()

    if num_experts is not None:
        args.num_experts = num_experts
    # hasattr is used for every option so that the checks stay consistent and
    # do not depend on `args` supporting the `in` operator.
    assert (
        hasattr(args, 'num_experts')
    ), 'num_experts should be specified in arguments or fmoefy function'

    if hidden_hidden_size is not None:
        args.hidden_hidden_size = hidden_hidden_size
    elif not hasattr(args, 'hidden_hidden_size'):
        args.hidden_hidden_size = args.hidden_size * 4

    if top_k is not None:
        args.top_k = top_k
    elif not hasattr(args, 'top_k'):
        args.top_k = 2

    # Set distributed_experts to None to use default setting in args
    if distributed_experts is not None:
        args.distributed_experts = distributed_experts
    elif not hasattr(args, 'distributed_experts'):
        # Nothing to fall back on in args: use the default of this function
        # instead of failing later inside MegatronMLP with an AttributeError.
        args.distributed_experts = True

    for layer in model.language_model.transformer.layers:
        layer.mlp = MegatronMLP(args, get_torch_default_comm())
    return model
76
77
78


class DistributedDataParallel(DistributedGroupedDataParallel):
    r'''
    A drop-in replacement for the DDP module provided by Megatron. It builds on
    DistributedGroupedDataParallel to enable the sophisticated parallel and
    reduction strategies in Fast MoE.
    '''

    def __init__(self, module):
        r'''
        Wrap `module`, using Megatron's model-parallel group as `mp_group` and
        its data-parallel group as `dp_group`.
        '''
        # Imported lazily so that this module can be imported without Megatron.
        from megatron import mpu

        model_parallel_group = mpu.get_model_parallel_group()
        data_parallel_group = mpu.get_data_parallel_group()
        super().__init__(
            module,
            mp_group=model_parallel_group,
            dp_group=data_parallel_group,
        )

    def state_dict(self, *args, **kwargs):
        r'''
        Keep consistency with Megatron: forward the call to the wrapped module.
        '''
        wrapped = self.module
        return wrapped.state_dict(*args, **kwargs)

    def state_dict_for_save_checkpoint(self, *args, **kwargs):
        r'''
        Keep consistency with Megatron: forward the call to the wrapped module.
        '''
        wrapped = self.module
        return wrapped.state_dict_for_save_checkpoint(*args, **kwargs)

    def load_state_dict(self, *args, **kwargs):
        r'''
        Keep consistency with Megatron: forward the call to the wrapped module.
        '''
        wrapped = self.module
        return wrapped.load_state_dict(*args, **kwargs)