mlp.py 2.37 KB
Newer Older
Haotian Tang's avatar
Haotian Tang committed
1
2
3
import torch
import torch.nn as nn
import awq_inference_engine
Casper Hansen's avatar
Casper Hansen committed
4
import torch.nn.functional as F
Haotian Tang's avatar
Haotian Tang committed
5

6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
class QuantMPTMLP(nn.Module):
    """MPT-style feed-forward block: ``down_proj(act(up_proj(x)))``.

    The up-projection is not called as a module. Its ``qweight``, ``scales``
    and ``qzeros`` tensors are registered as buffers on this module and passed
    straight to ``awq_inference_engine.gemv_forward_cuda``. The activation and
    the down-projection are invoked as ordinary callables.

    Args:
        up_proj: Projection object exposing ``qweight``, ``scales`` and
            ``qzeros`` tensors.
        act: Callable applied to the up-projection output.
        down_proj: Callable applied after the activation; must expose a
            ``group_size`` attribute.
    """

    def __init__(
        self,
        up_proj,
        act,
        down_proj
    ):
        super().__init__()
        # Keep direct handles to the packed tensors so forward() can hand them
        # to the kernel without going through ``up_proj.forward``.
        self.register_buffer('up_proj_qweight', up_proj.qweight)
        self.register_buffer('up_proj_scales', up_proj.scales)
        self.register_buffer('up_proj_qzeros', up_proj.qzeros)

        self.up_proj = up_proj
        self.act = act
        self.down_proj = down_proj

    def forward(self, x: torch.Tensor):
        """Apply the MLP to ``x`` and return a tensor with the same leading dims.

        Args:
            x: Input tensor; the last dimension is the feature dimension.

        Returns:
            The down-projection output, reshaped so that every dimension
            except the last matches ``x``.
        """
        # Remember the leading dimensions: the kernel is fed a 2-D view, and
        # without restoring them a (batch, seq, dim) input would come back as
        # (batch * seq, out), unlike QuantLlamaMLP which restores its shape.
        leading_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])

        # NOTE(review): the group size is read from ``down_proj`` although the
        # tensors belong to ``up_proj`` -- presumably both layers share the
        # same quantization group size; confirm against the quantizer config.
        x = awq_inference_engine.gemv_forward_cuda(
            x,
            self.up_proj_qweight,
            self.up_proj_scales,
            self.up_proj_qzeros,
            self.down_proj.group_size
        )

        x = self.down_proj(self.act(x))
        return x.reshape(leading_shape + x.shape[-1:])
Haotian Tang's avatar
Haotian Tang committed
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

class QuantLlamaMLP(nn.Module):
    """SiLU-gated MLP: ``down_proj(silu(gate(x)) * up(x))``.

    The gate and up projections are not called as modules. Their ``qweight``,
    ``scales`` and ``qzeros`` tensors are registered as buffers here and passed
    to ``awq_inference_engine.gemv_forward_cuda``. Only ``down_proj`` is kept
    and invoked as a callable.

    Args:
        gate_proj: Projection exposing ``qweight``, ``scales``, ``qzeros``,
            ``in_features``, ``out_features`` and ``w_bit``.
        down_proj: Callable applied to the gated product; must expose
            ``out_features`` and ``group_size``.
        up_proj: Projection exposing ``qweight``, ``scales`` and ``qzeros``.
    """

    def __init__(
        self,
        gate_proj,
        down_proj,
        up_proj,
    ):
        super().__init__()
        # Direct handles to the packed tensors of both input projections.
        self.register_buffer('gate_proj_qweight', gate_proj.qweight)
        self.register_buffer('gate_proj_scales', gate_proj.scales)
        self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
        self.register_buffer('up_proj_qweight', up_proj.qweight)
        self.register_buffer('up_proj_scales', up_proj.scales)
        self.register_buffer('up_proj_qzeros', up_proj.qzeros)

        # Layer dimensions, copied from the wrapped projections.
        self.in_features = gate_proj.in_features
        self.intermediate_size = gate_proj.out_features
        self.out_features = down_proj.out_features
        self.w_bit = gate_proj.w_bit
        self.down_proj = down_proj

    def forward(self, x):
        """Apply the gated MLP to ``x``.

        Args:
            x: Input tensor; the last dimension is the feature dimension.

        Returns:
            The result of ``down_proj`` applied to the gated product, which is
            first reshaped to ``x``'s leading dimensions plus
            ``intermediate_size``.
        """
        # Shape of the intermediate activation once the 2-D kernel output is
        # folded back to the caller's leading dimensions.
        out_shape = x.shape[:-1] + (self.intermediate_size,)
        x = x.reshape(-1, x.shape[-1])

        # NOTE(review): the group size is read from ``down_proj`` for both
        # kernel calls -- presumably all three projections share one
        # quantization group size; confirm against the quantizer config.
        group_size = self.down_proj.group_size

        gate_output = awq_inference_engine.gemv_forward_cuda(
            x,
            self.gate_proj_qweight,
            self.gate_proj_scales,
            self.gate_proj_qzeros,
            group_size,
        )
        up_output = awq_inference_engine.gemv_forward_cuda(
            x,
            self.up_proj_qweight,
            self.up_proj_scales,
            self.up_proj_qzeros,
            group_size,
        )

        x = F.silu(gate_output) * up_output
        x = x.reshape(out_shape)
        x = self.down_proj(x)

        return x