mlp.py 2.17 KB
Newer Older
1
2
3
4
5
6
7
# Copyright (c) 2022, Tri Dao.

import torch
import torch.nn as nn
import torch.nn.functional as F

try:
8
    from flash_attn.ops.fused_dense import FusedMLP, ParallelFusedMLP
9
except ImportError:
10
    FusedMLP, ParallelFusedMLP = None, None
11
12
13
14
15


class Mlp(nn.Module):

    def __init__(self, in_features, hidden_features=None, out_features=None, activation=F.gelu,
Tri Dao's avatar
Tri Dao committed
16
                 bias1=True, bias2=True, return_residual=False, device=None, dtype=None):
17
18
19
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        out_features = out_features or in_features
20
        hidden_features = hidden_features or in_features * 4
21
        self.return_residual = return_residual
Tri Dao's avatar
Tri Dao committed
22
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
23
        self.activation = activation
Tri Dao's avatar
Tri Dao committed
24
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
25
26

    def forward(self, x):
27
28
29
30
        y = self.fc1(x)
        y = self.activation(y)
        y = self.fc2(y)
        return y if not self.return_residual else (y, x)
Tri Dao's avatar
Tri Dao committed
31
32
33
34
35


class GatedMlp(nn.Module):

    def __init__(self, in_features, hidden_features=None, out_features=None, activation=F.sigmoid,
Tri Dao's avatar
Tri Dao committed
36
37
                 bias1=True, bias2=True, multiple_of=256, return_residual=False,
                 device=None, dtype=None):
Tri Dao's avatar
Tri Dao committed
38
39
40
41
42
43
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or int(8 * in_features / 3)
        hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
        self.return_residual = return_residual
Tri Dao's avatar
Tri Dao committed
44
        self.fc1 = nn.Linear(in_features, 2 * hidden_features, bias=bias1, **factory_kwargs)
Tri Dao's avatar
Tri Dao committed
45
        self.activation = activation
Tri Dao's avatar
Tri Dao committed
46
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias1, **factory_kwargs)
Tri Dao's avatar
Tri Dao committed
47
48
49
50
51
52
53
54
55
56

    def forward(self, x):
        y = self.fc1(x)
        if self.activation == F.sigmoid:  # Special case for GLU
            y = F.glu(y, dim=-1)
        else:
            y, gate = y.chunk(2, dim=-1)
            y = y * self.activation(gate)
        y = self.fc2(y)
        return y if not self.return_residual else (y, x)