"tests/vscode:/vscode.git/clone" did not exist on "ee484b3f4b0c061d2612ea7c0cb40b44baf680c0"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only OPT model compatible with HuggingFace weights."""
22

23
from collections.abc import Iterable
24
from itertools import islice
25
from typing import Optional, Union
Woosuk Kwon's avatar
Woosuk Kwon committed
26

Woosuk Kwon's avatar
Add OPT  
Woosuk Kwon committed
27
28
29
30
import torch
from torch import nn
from transformers import OPTConfig

31
from vllm.attention import Attention
32
from vllm.compilation.decorators import support_torch_compile
33
from vllm.config import CacheConfig, VllmConfig
34
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
Woosuk Kwon's avatar
Woosuk Kwon committed
35
from vllm.model_executor.layers.activation import get_act_fn
36
37
38
39
40
41
from vllm.model_executor.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
42
from vllm.model_executor.layers.logits_processor import LogitsProcessor
43
from vllm.model_executor.layers.quantization import QuantizationConfig
44
from vllm.model_executor.layers.vocab_parallel_embedding import (
45
46
47
    ParallelLMHead,
    VocabParallelEmbedding,
)
48
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
49
from vllm.sequence import IntermediateTensors
Woosuk Kwon's avatar
Woosuk Kwon committed
50

51
from .interfaces import SupportsLoRA, SupportsPP
52
53
54
55
56
57
58
59
from .utils import (
    AutoWeightsLoader,
    WeightsMapper,
    is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory,
    make_layers,
    maybe_prefix,
)
60

Woosuk Kwon's avatar
Add OPT  
Woosuk Kwon committed
61
62
63

class OPTLearnedPositionalEmbedding(nn.Embedding):
    """Learned positional embedding whose lookups are shifted by a fixed offset.

    The table is allocated with ``num_embeddings + offset`` rows and every
    lookup adds ``offset`` to the requested position ids before indexing.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        """Allocate the embedding table.

        Args:
            num_embeddings: Number of positions to support, excluding the
                extra ``offset`` rows added here.
            embedding_dim: Size of each embedding vector.
        """
        # OPT is set up so that if padding_idx is specified then offset the
        # embedding ids by 2 and adjust num_embeddings appropriately. Other
        # models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, positions: torch.Tensor) -> torch.Tensor:
        """Return the embeddings for ``positions`` shifted by ``self.offset``."""
        return super().forward(positions + self.offset)


class OPTAttention(nn.Module):
    """Self-attention block for OPT.

    A fused QKV projection feeds the ``Attention`` layer, and the attention
    output is passed through a row-parallel output projection.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        bias: bool = True,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        """Build the projections and the attention layer.

        Args:
            embed_dim: Input and output feature size of the block.
            num_heads: Total number of attention heads; must be divisible by
                the tensor-model-parallel world size.
            bias: Whether the QKV and output projections carry a bias.
            cache_config: Forwarded to ``Attention``.
            quant_config: Forwarded to the linear layers and ``Attention``.
            prefix: Dotted name of this module, used to name its sub-layers.
        """
        super().__init__()
        self.embed_dim = embed_dim
        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
        total_num_heads = num_heads
        assert num_heads % tensor_model_parallel_world_size == 0
        # Head count handled locally: the total divided by the TP world size.
        self.num_heads = total_num_heads // tensor_model_parallel_world_size
        self.head_dim = embed_dim // total_num_heads
        self.scaling = self.head_dim**-0.5

        self.qkv_proj = QKVParallelLinear(
            embed_dim,
            self.head_dim,
            total_num_heads,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.out_proj",
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            scale=self.scaling,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Project to Q/K/V, run attention, and apply the output projection."""
        # The linear layers return a tuple; only the first element is used.
        qkv, _ = self.qkv_proj(hidden_states)
        # Split the fused projection into three equal parts on the last dim.
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        attn_output = self.attn(q, k, v)
        output, _ = self.out_proj(attn_output)
        return output

Woosuk Kwon's avatar
Woosuk Kwon committed
127

Woosuk Kwon's avatar
Add OPT  
Woosuk Kwon committed
128
class OPTDecoderLayer(nn.Module):
    """One OPT decoder layer: self-attention then a two-layer feed-forward.

    Each sub-block is wrapped in a residual connection. Depending on
    ``config.do_layer_norm_before`` the layer norm is applied either before
    the sub-block (on its input) or after the residual addition.
    """

    def __init__(
        self,
        config: OPTConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        """Build the attention block, the feed-forward layers and both norms.

        Args:
            config: OPT configuration providing sizes, bias, activation and
                layer-norm settings.
            cache_config: Forwarded to the attention block.
            quant_config: Forwarded to the attention block and linear layers.
            prefix: Dotted name of this layer, used to name its sub-layers.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.self_attn = OPTAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            bias=config.enable_bias,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )
        self.do_layer_norm_before = config.do_layer_norm_before

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )
        self.fc1 = ColumnParallelLinear(
            self.embed_dim,
            config.ffn_dim,
            bias=config.enable_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.fc1",
        )
        self.activation_fn = get_act_fn(config.activation_function)
        self.fc2 = RowParallelLinear(
            config.ffn_dim,
            self.embed_dim,
            bias=config.enable_bias,
            quant_config=quant_config,
            prefix=f"{prefix}.fc2",
        )
        self.final_layer_norm = nn.LayerNorm(
            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Run the attention and feed-forward sub-blocks with residuals."""
        # Self Attention
        residual = hidden_states
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states = self.self_attn(hidden_states=hidden_states)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Fully Connected
        residual = hidden_states
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE the feed-forward block
        if self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)
        # The linear layers return a tuple; only the first element is used.
        hidden_states, _ = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states, _ = self.fc2(hidden_states)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER the feed-forward block
        if not self.do_layer_norm_before:
            hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states


Zhuohan Li's avatar
Zhuohan Li committed
201
class OPTDecoder(nn.Module):
    """OPT decoder stack: embeddings, decoder layers and optional final norm.

    Token and positional embeddings are summed on the first pipeline rank,
    the locally held decoder layers are applied, and on the last pipeline
    rank the optional final layer norm and output projection are applied.
    """

    def __init__(
        self,
        config: OPTConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        """Build embeddings, optional projections, final norm and layers.

        Args:
            config: OPT configuration.
            cache_config: Forwarded to every decoder layer.
            quant_config: Forwarded to the projections and decoder layers.
            prefix: Dotted name of this module, used to name its sub-layers.
        """
        super().__init__()
        self.config = config
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.word_embed_proj_dim,
        )
        # Positional embeddings are replicated (not sharded).
        self.embed_positions = OPTLearnedPositionalEmbedding(
            config.max_position_embeddings, config.hidden_size
        )

        # Project out & in will be replicated if they exist. They are only
        # created when the word-embedding size differs from the hidden size.
        needs_projection = config.word_embed_proj_dim != config.hidden_size
        if needs_projection:
            self.project_out = ReplicatedLinear(
                config.hidden_size,
                config.word_embed_proj_dim,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.project_out",
            )
            self.project_in = ReplicatedLinear(
                config.word_embed_proj_dim,
                config.hidden_size,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.project_in",
            )
        else:
            self.project_out = None
            self.project_in = None

        # Note that the only purpose of `config._remove_final_layer_norm` is to
        # keep backward compatibility with checkpoints that have been fine-tuned
        # before transformers v4.20.1
        # see https://github.com/facebookresearch/metaseq/pull/164
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
                config.hidden_size,
                elementwise_affine=config.layer_norm_elementwise_affine,
            )
        else:
            self.final_layer_norm = None

        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: OPTDecoderLayer(
                config, cache_config, quant_config, prefix=prefix
            ),
            prefix=f"{prefix}.layers",
        )

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the decoder layers held by this pipeline rank.

        Args:
            input_ids: Token ids; embedded on the first pipeline rank when
                ``inputs_embeds`` is not supplied.
            positions: Position ids passed to the positional embedding.
            intermediate_tensors: Must be provided on non-first pipeline
                ranks; its ``"hidden_states"`` entry is the layer input.
            inputs_embeds: Optional precomputed token embeddings.

        Returns:
            The final hidden states on the last pipeline rank, otherwise an
            ``IntermediateTensors`` carrying ``"hidden_states"``.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is None:
                inputs_embeds = self.get_input_embeddings(input_ids)
            pos_embeds = self.embed_positions(positions)
            if self.project_in is not None:
                inputs_embeds, _ = self.project_in(inputs_embeds)
            hidden_states = inputs_embeds + pos_embeds
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]

        # Only the layers in [start_layer, end_layer) are applied here.
        for layer in islice(self.layers, self.start_layer, self.end_layer):
            hidden_states = layer(hidden_states)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})
        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)
        if self.project_out is not None:
            hidden_states, _ = self.project_out(hidden_states)
        return hidden_states


299
@support_torch_compile
class OPTModel(nn.Module):
    """Thin wrapper that owns the ``OPTDecoder`` and loads its weights."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the decoder from the model, cache and quantization configs.

        Args:
            vllm_config: Source of the HF config, cache config and quant
                config.
            prefix: Dotted name of this module; the decoder is named
                ``f"{prefix}.decoder"``.
        """
        super().__init__()

        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.decoder = OPTDecoder(
            config, cache_config, quant_config, prefix=f"{prefix}.decoder"
        )
        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
            ["hidden_states"], config.hidden_size
        )

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings via the decoder."""
        return self.decoder.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Delegate to the decoder and return its result unchanged."""
        return self.decoder(
            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
        )

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint tensors into this module's parameters.

        Checkpoint names containing ``q_proj``/``k_proj``/``v_proj`` are
        redirected to the fused ``qkv_proj`` parameter and loaded with the
        matching shard id; all other names are loaded as-is.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of (possibly renamed) parameter names that were loaded.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            # Resolve the stacked-parameter mapping exactly once. Stopping at
            # the first match matters: "v_proj" is a substring of "qkv_proj",
            # so re-scanning an already rewritten name would mangle it.
            shard_id = None
            for param_name, weight_name, candidate_shard_id in stacked_params_mapping:
                if weight_name in name:
                    name = name.replace(weight_name, param_name)
                    shard_id = candidate_shard_id
                    break

            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, self):
                continue

            param = params_dict[name]
            if shard_id is not None:
                # Fused parameters must provide their own shard-aware loader.
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
            else:
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params

Woosuk Kwon's avatar
Add OPT  
Woosuk Kwon committed
364

365
class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
    """OPT model with a language-modelling head."""

    # Fused parameter name -> the checkpoint sub-module names it packs.
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    }

    # Rewrites checkpoint names starting with "decoder." to "model.decoder.".
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "decoder.": "model.decoder.",
        }
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the base model, the LM head and the logits processor.

        Args:
            vllm_config: Source of the HF config and quantization config;
                also passed through to ``OPTModel``.
            prefix: Dotted name of this module, used to name sub-modules.
        """
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = OPTModel(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )
        if self.config.tie_word_embeddings:
            # Tied embeddings: reuse the token embedding module as the head.
            self.lm_head = self.model.decoder.embed_tokens
        else:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.word_embed_proj_dim,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up token embeddings via the wrapped model."""
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the wrapped model and return its output unchanged.

        Logits are not computed here; see ``compute_logits``.
        """
        hidden_states = self.model(
            input_ids, positions, intermediate_tensors, inputs_embeds
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> Optional[torch.Tensor]:
        """Return the logits processor's output for ``hidden_states``."""
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights through ``AutoWeightsLoader``.

        When the word embeddings are tied, ``lm_head.weight`` is passed as a
        skip prefix because the head reuses the token embedding module.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            Whatever ``AutoWeightsLoader.load_weights`` returns after the
            names are remapped with ``hf_to_vllm_mapper``.
        """
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(
                ["lm_head.weight"] if self.config.tie_word_embeddings else None
            ),
        )
        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)