util.py 1.28 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from colossalai.context import ParallelMode
from colossalai.nn.layer import WrappedDropout as Dropout


def moe_sa_args(d_model: int,
                n_heads: int,
                d_kv: int,
                attention_drop: float = 0,
                drop_rate: float = 0,
                bias: bool = True):
    """This is an example for args in moe self attention, since lots of modules should be
    adapted before putting them in experts.
    """
    dropout1 = Dropout(attention_drop, mode=ParallelMode.TENSOR)
    dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    return dict(
        d_model=d_model,
        n_heads=n_heads,
        d_kv=d_kv,
        bias=bias,
        dropout1=dropout1,
        dropout2=dropout2
    )


def moe_mlp_args(d_model: int,
                 d_ff: int,
                 drop_rate: float,
                 bias: bool = True):
    """This is an example for args of MLP in Experts, since lots of modules should be adapted
    before putting them in experts.
    """
    dropout1 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    dropout2 = Dropout(drop_rate, mode=ParallelMode.TENSOR)
    return dict(
        d_model=d_model,
        d_ff=d_ff,
        bias=bias,
        dropout1=dropout1,
        dropout2=dropout2
    )