baichuan.py 13.8 KB
Newer Older
codethazine's avatar
codethazine committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Woosuk Kwon's avatar
Woosuk Kwon committed
20
"""Inference-only BaiChuan model compatible with HuggingFace weights."""
21
import math
22
from typing import List, Optional, Tuple
codethazine's avatar
codethazine committed
23
24
25
26
27
28

import torch
from torch import nn

from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
Woosuk Kwon's avatar
Woosuk Kwon committed
29
from vllm.model_executor.layers.attention import PagedAttention
30
31
32
33
34
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
Woosuk Kwon's avatar
Woosuk Kwon committed
35
from vllm.model_executor.layers.rotary_embedding import get_rope
codethazine's avatar
codethazine committed
36
from vllm.model_executor.layers.sampler import Sampler
37
38
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
codethazine's avatar
codethazine committed
39
40
from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
41
42
from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
43
from vllm.sequence import SamplerOutput
codethazine's avatar
codethazine committed
44
45
46
47
48
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig

KVCache = Tuple[torch.Tensor, torch.Tensor]


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
    base = torch.tensor(
        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
        dtype=torch.float32,
    )
    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
    slopes = torch.pow(base, powers)

    if closest_power_of_2 != total_num_heads:
        extra_base = torch.tensor(
            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
            dtype=torch.float32,
        )
        num_remaining_heads = min(closest_power_of_2,
                                  total_num_heads - closest_power_of_2)
        extra_powers = torch.arange(start=1,
                                    end=1 + 2 * num_remaining_heads,
                                    step=2,
                                    dtype=torch.int32)
        slopes = torch.cat(
            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
    return slopes


codethazine's avatar
codethazine committed
74
75
76
77
78
79
80
class BaiChuanMLP(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
81
        linear_method: Optional[LinearMethodBase] = None,
codethazine's avatar
codethazine committed
82
83
    ):
        super().__init__()
84
85
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size, [intermediate_size] * 2,
86
            bias=False,
87
88
89
90
91
            linear_method=linear_method)
        self.down_proj = RowParallelLinear(intermediate_size,
                                           hidden_size,
                                           bias=False,
                                           linear_method=linear_method)
codethazine's avatar
codethazine committed
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class BaiChuanAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
111
        position_embedding: str,
112
113
        rope_theta: float = 10000,
        max_position_embeddings: int = 8192,
114
        linear_method: Optional[LinearMethodBase] = None,
codethazine's avatar
codethazine committed
115
116
117
118
119
120
121
122
123
124
    ):
        super().__init__()
        self.hidden_size = hidden_size
        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
        )
        self.total_num_heads = num_heads
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = (self.total_num_heads //
                          tensor_model_parallel_world_size)
        self.head_dim = hidden_size // self.total_num_heads
125
        self.postion_embedding = position_embedding
126
127
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
codethazine's avatar
codethazine committed
128
129

        # pylint: disable=invalid-name
130
        self.W_pack = QKVParallelLinear(
codethazine's avatar
codethazine committed
131
            hidden_size,
132
133
134
            self.head_dim,
            self.total_num_heads,
            self.total_num_heads,
codethazine's avatar
codethazine committed
135
            bias=False,
136
            linear_method=linear_method,
codethazine's avatar
codethazine committed
137
138
139
140
141
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
142
            linear_method=linear_method,
codethazine's avatar
codethazine committed
143
        )
144
145
146
147
148
149
150
151
152
        # Create the alibi slopes and slice them.
        if self.postion_embedding == "ALIBI":
            tp_rank = get_tensor_model_parallel_rank()
            head_start = tp_rank * self.num_heads
            head_end = (tp_rank + 1) * self.num_heads
            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
            alibi_slopes = alibi_slopes[head_start:head_end].tolist()

            scaling = self.head_dim**-0.5
Woosuk Kwon's avatar
Woosuk Kwon committed
153
154
155
156
            self.attn = PagedAttention(self.num_heads,
                                       self.head_dim,
                                       scaling,
                                       alibi_slopes=alibi_slopes)
157
        else:
Woosuk Kwon's avatar
Woosuk Kwon committed
158
            self.rotary_emb = get_rope(
159
160
                self.head_dim,
                rotary_dim=self.head_dim,
Woosuk Kwon's avatar
Woosuk Kwon committed
161
                max_position=self.max_position_embeddings,
162
                base=self.rope_theta,
Woosuk Kwon's avatar
Woosuk Kwon committed
163
164
165
166
            )
            self.scaling = self.head_dim**-0.5
            self.attn = PagedAttention(self.num_heads, self.head_dim,
                                       self.scaling)
codethazine's avatar
codethazine committed
167
168
169
170
171
172
173
174
175
176
177

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.W_pack(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
Woosuk Kwon's avatar
Woosuk Kwon committed
178
179
        if self.postion_embedding != "ALIBI":
            q, k = self.rotary_emb(positions, q, k)
codethazine's avatar
codethazine committed
180
        k_cache, v_cache = kv_cache
Woosuk Kwon's avatar
Woosuk Kwon committed
181
182
        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
                                cache_event)
codethazine's avatar
codethazine committed
183
184
185
186
187
188
        output, _ = self.o_proj(attn_output)
        return output


class BaiChuanDecoderLayer(nn.Module):

189
190
191
192
    def __init__(self,
                 config: BaiChuanConfig,
                 position_embedding: str,
                 linear_method: Optional[LinearMethodBase] = None):
codethazine's avatar
codethazine committed
193
194
        super().__init__()
        self.hidden_size = config.hidden_size
195
196
197
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
codethazine's avatar
codethazine committed
198
199
200
        self.self_attn = BaiChuanAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
201
            position_embedding=position_embedding,
202
203
            rope_theta=rope_theta,
            max_position_embeddings=max_position_embeddings,
204
            linear_method=linear_method,
codethazine's avatar
codethazine committed
205
206
207
208
209
        )
        self.mlp = BaiChuanMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
210
            linear_method=linear_method,
codethazine's avatar
codethazine committed
211
212
213
214
215
216
217
218
219
220
221
222
223
        )
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: KVCache,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
224
225
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
codethazine's avatar
codethazine committed
226
        # Self Attention
227
228
229
230
231
232
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
codethazine's avatar
codethazine committed
233
234
235
236
237
238
239
240
241
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            input_metadata=input_metadata,
            cache_event=cache_event,
        )

        # Fully Connected
242
243
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
codethazine's avatar
codethazine committed
244
        hidden_states = self.mlp(hidden_states)
245
        return hidden_states, residual
codethazine's avatar
codethazine committed
246
247
248
249


class BaiChuanModel(nn.Module):

250
251
252
253
    def __init__(self,
                 config: BaiChuanConfig,
                 position_embedding: str,
                 linear_method: Optional[LinearMethodBase] = None):
codethazine's avatar
codethazine committed
254
255
256
257
258
259
260
261
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
262
        )
codethazine's avatar
codethazine committed
263
        self.layers = nn.ModuleList([
264
            BaiChuanDecoderLayer(config, position_embedding, linear_method)
codethazine's avatar
codethazine committed
265
266
267
268
269
270
271
272
273
274
275
276
277
            for _ in range(config.num_hidden_layers)
        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
278
        residual = None
codethazine's avatar
codethazine committed
279
        for i in range(len(self.layers)):
280
            cache_event = None if cache_events is None else cache_events[i]
codethazine's avatar
codethazine committed
281
            layer = self.layers[i]
282
            hidden_states, residual = layer(
codethazine's avatar
codethazine committed
283
284
285
286
287
                positions,
                hidden_states,
                kv_caches[i],
                input_metadata,
                cache_event,
288
                residual,
codethazine's avatar
codethazine committed
289
            )
290
        hidden_states, _ = self.norm(hidden_states, residual)
codethazine's avatar
codethazine committed
291
292
293
        return hidden_states


294
class BaiChuanBaseForCausalLM(nn.Module):
codethazine's avatar
codethazine committed
295

296
297
298
299
    def __init__(self,
                 config,
                 position_embedding: str,
                 linear_method: Optional[LinearMethodBase] = None):
codethazine's avatar
codethazine committed
300
301
        super().__init__()
        self.config = config
302
303
304
        self.linear_method = linear_method
        self.model = BaiChuanModel(config, position_embedding, linear_method)
        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
codethazine's avatar
codethazine committed
305
306
307
308
309
310
311
312
313
        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
314
    ) -> SamplerOutput:
codethazine's avatar
codethazine committed
315
316
317
318
319
320
321
322
323
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
                                   input_metadata)
        return next_tokens

    def load_weights(self,
                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
Jasmond L's avatar
Jasmond L committed
324
325
                     load_format: str = "auto",
                     revision: Optional[str] = None):
326
327
328
329
330
331
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
codethazine's avatar
codethazine committed
332
        for name, loaded_weight in hf_model_weights_iterator(
Jasmond L's avatar
Jasmond L committed
333
                model_name_or_path, cache_dir, load_format, revision):
codethazine's avatar
codethazine committed
334
335
            if "rotary_emb.inv_freq" in name:
                continue
336
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
codethazine's avatar
codethazine committed
337
338
                if weight_name not in name:
                    continue
339
340
341
                param = params_dict[name.replace(weight_name, param_name)]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
codethazine's avatar
codethazine committed
342
                break
343
344
345
346
347
            else:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
348
349
350
351


class BaichuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 13b

352
353
354
355
    def __init__(self,
                 config,
                 linear_method: Optional[LinearMethodBase] = None):
        super().__init__(config, "ALIBI", linear_method)
356
357
358
359


class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 7b

360
361
362
363
    def __init__(self,
                 config,
                 linear_method: Optional[LinearMethodBase] = None):
        super().__init__(config, "ROPE", linear_method)