utils.py 17.3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Utilities for selecting and loading models."""
4

5
6
7
import inspect
import warnings
from contextlib import contextmanager
8
from dataclasses import dataclass, field
9

zhuwenwen's avatar
zhuwenwen committed
10
import os
11
12
import torch
from torch import nn
13
from typing_extensions import assert_never
14

15
from vllm.attention.layer import Attention, MLAAttention
16
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
17
from vllm.logger import init_logger
18
from vllm.model_executor.layers.quantization.base_config import (
19
20
21
    QuantizationConfig,
    QuantizeMethodBase,
)
22
from vllm.model_executor.models.interfaces import SupportsQuant
23
from vllm.utils.platform_utils import is_pin_memory_available
24
from vllm import envs
25

26
27
logger = init_logger(__name__)

28

29
30
31
32
def initialize_model(
    vllm_config: VllmConfig,
    *,
    prefix: str = "",
    model_class: type[nn.Module] | None = None,
    model_config: ModelConfig | None = None,
) -> nn.Module:
    """Instantiate the model class for the given configuration.

    Args:
        vllm_config: Top-level config; also installed as the current vLLM
            config while the model is being constructed.
        prefix: Forwarded to the model constructor when it accepts `prefix`.
        model_class: Class to instantiate. When omitted it is looked up with
            `get_model_architecture(model_config)`.
        model_config: Model config to use. Defaults to
            `vllm_config.model_config`.

    Returns:
        The constructed model instance.
    """
    if model_config is None:
        model_config = vllm_config.model_config
    if model_class is None:
        model_class = get_model_architecture(model_config)[0]

    if vllm_config.quant_config is not None:
        configure_quant_config(vllm_config.quant_config, model_class)

    init_param_names = list(inspect.signature(model_class.__init__).parameters)
    is_new_style = (
        "vllm_config" in init_param_names and "prefix" in init_param_names
    )

    if is_new_style:
        with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix):
            return model_class(vllm_config=vllm_config, prefix=prefix)

    # Old-style constructor: warn, then pass only the arguments it declares.
    warnings.warn(
        "vLLM model class should accept `vllm_config` and `prefix` as "
        "input arguments. Possibly you have an old-style model class"
        " registered from out of tree and it is used for new vLLM version. "
        "Check https://docs.vllm.ai/en/latest/design/arch_overview.html "
        "for the design and update the model class accordingly.",
        DeprecationWarning,
        stacklevel=2,
    )
    logger.warning(
        "Trying to guess the arguments for old-style model class %s",
        model_class,
    )

    # (constructor parameter name, lazy value) pairs. A value is only read
    # from the configs when the constructor actually declares the parameter.
    legacy_arguments = (
        ("prefix", lambda: prefix),
        ("config", lambda: model_config.hf_config),
        ("cache_config", lambda: vllm_config.cache_config),
        ("quant_config", lambda: vllm_config.quant_config),
        ("lora_config", lambda: vllm_config.lora_config),
        ("scheduler_config", lambda: vllm_config.scheduler_config),
    )
    kwargs = {
        name: get_value()
        for name, get_value in legacy_arguments
        if name in init_param_names
    }
    with set_current_vllm_config(vllm_config, check_compile=True, prefix=prefix):
        return model_class(**kwargs)


83
84
85
def process_weights_after_loading(
    model: nn.Module, model_config: ModelConfig, target_device: torch.device
) -> None:
    """Run the post-load weight hooks of every submodule of `model`.

    Quantization methods are processed first, then attention layers, so the
    attention hooks see weights that the quant methods already handled.

    Args:
        model: Model whose submodules are visited.
        model_config: Supplies the dtype passed to the attention hooks and is
            forwarded to the weight-reloading metadata helper.
        target_device: Device the quant methods expect parameters to be on
            while they process a module.
    """
    if getattr(model, "process_weights_after_loading_already_called", False):
        # A previous call already handled this model; later calls are no-ops.
        logger.debug_once(
            "process_weights_after_loading already called for model %s", model
        )
        return

    # Imported lazily to avoid a circular dependency.
    from vllm.model_executor.model_loader.online_quantization import (
        maybe_save_metadata_and_attributes_for_weight_reloading,
    )

    maybe_save_metadata_and_attributes_for_weight_reloading(model, model_config)

    for submodule in model.modules():
        quant_method = getattr(submodule, "quant_method", None)
        if not isinstance(quant_method, QuantizeMethodBase):
            continue
        # Quant methods that repack or quantize after loading expect the
        # parameters on the global target device. With CPU offloading the
        # context moves them onto the device for processing and back after.
        with device_loading_context(submodule, target_device):
            quant_method.process_weights_after_loading(submodule)

    # Post-load initialization for both Attention and MLA layers.
    # NOTE: Happens after other modules so we can easily decompress weights.
    attention_types = (Attention, MLAAttention)
    for submodule in model.modules():
        if not isinstance(submodule, attention_types):
            continue
        if not hasattr(submodule, "process_weights_after_loading"):
            continue
        # TODO(lucas): see if there is a way to unify the signatures
        # of process_weights_after_loading
        submodule.process_weights_after_loading(model_config.dtype)


@contextmanager
def device_loading_context(module: torch.nn.Module, target_device: torch.device):
    """Temporarily move `module`'s CPU parameters onto `target_device`.

    On entry, every parameter currently on CPU is moved to `target_device`.
    On exit (including when the body raises) those same parameters are copied
    back into freshly allocated CPU tensors, pinned when pinned memory is
    available. Parameters that were not on CPU at entry, and parameters
    created inside the context, are left untouched.

    Args:
        module: Module whose parameters are moved.
        target_device: Device to process on. When it is a CPU device nothing
            is moved.

    Yields:
        The same `module` object.
    """
    if target_device.type == "cpu":
        # Target is CPU: nothing to move and nothing to restore.
        yield module
        return

    # Names of the parameters moved off CPU. Only CPU-resident parameters are
    # ever recorded, so restoring always means "copy back to CPU".
    moved_from_cpu: set[str] = set()
    for name, param in module.named_parameters():
        if param.device.type == "cpu":
            moved_from_cpu.add(name)
            param.data = param.data.to(target_device)

    try:
        yield module
    finally:
        pin_memory = is_pin_memory_available()
        for name, param in module.named_parameters():
            if name not in moved_from_cpu:
                # New parameter, or one that was already on the device.
                continue
            # `torch.empty_like` does not support the `pin_memory` argument,
            # so allocate with `empty_strided` to keep size/stride/layout.
            cpu_data = torch.empty_strided(
                size=param.data.size(),
                stride=param.data.stride(),
                dtype=param.data.dtype,
                layout=param.data.layout,
                device="cpu",
                pin_memory=pin_memory,
            )
            cpu_data.copy_(param.data)
            param.data = cpu_data


165
# Caches the outputs of `_get_model_architecture`.
_MODEL_ARCH_BY_HASH: dict[int, tuple[type[nn.Module], str]] = {}


169
# Architectures for which the environment-variable defaults below are applied.
# For every other architecture LLAMA_NN / LM_NN / GEMM_PAD / FA_PAD / AWQ_PAD
# are forced to "0".
# TODO: 'Qwen2_5_VLForConditionalGeneration',
_NN_SUPPORTED_ARCHITECTURES = (
    "LlamaForCausalLM",
    "QWenLMHeadModel",
    "Qwen2ForCausalLM",
    "Qwen2VLForConditionalGeneration",
    "Qwen2MoeForCausalLM",
    "Qwen3ForCausalLM",
    "Qwen3MoeForCausalLM",
    "ChatGLMModel",
    "Glm4ForCausalLM",
    "ChatGLMForConditionalGeneration",
    "BaichuanForCausalLM",
    "BloomForCausalLM",
    "TeleChat2ForCausalLM",
    "MixtralForCausalLM",
    "FalconForCausalLM",
    "MedusaModel",
    "MLPSpeculatorPreTrainedModel",
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
    "DeepSeekMTPModel",
)

# Quantization methods for which DeepSeek models also default
# VLLM_USE_LIGHTOP_FILL_MOE_ALIGN to "1".
_FILL_MOE_ALIGN_QUANT_METHODS = frozenset(
    {
        "slimquant_w4a8",
        "slimquant_w4a8_marlin",
        "slimquant_compressed_tensors_marlin",
        "compressed-tensors",
    }
)


def _enable_env_by_default(name: str) -> None:
    """Set env var `name` to "1" unless `envs.is_set` reports it as set."""
    if not envs.is_set(name):
        os.environ[name] = "1"


def _apply_nn_env_flags(architectures, visions) -> None:
    """Derive LLAMA_NN and LM_NN from the architecture and current env."""
    if os.getenv("LLAMA_NN") != "0":
        # Not explicitly disabled: turn it off only for QWen / ChatGLM
        # checkpoints whose config carries a vision section.
        has_vision_tower = (
            architectures in (["QWenLMHeadModel"], ["ChatGLMModel"])
            and visions != []
        )
        os.environ["LLAMA_NN"] = "0" if has_vision_tower else "1"

    lm_nn_disabled = (
        architectures in (["BloomForCausalLM"], ["FalconForCausalLM"])
        or os.getenv("LM_NN") == "0"
    )
    os.environ["LM_NN"] = "0" if lm_nn_disabled else "1"


def _apply_fused_kernel_env_defaults(architectures, model_config) -> None:
    """Default the architecture-specific VLLM_USE_* switches to "1"."""
    if architectures in (["DeepseekV3ForCausalLM"], ["DeepSeekMTPModel"]):
        for name in (
            "VLLM_USE_LIGHTOP",
            "VLLM_USE_OPT_CAT",
            "VLLM_USE_FUSED_FILL_RMS_CAT",
        ):
            _enable_env_by_default(name)
        if model_config.quantization in _FILL_MOE_ALIGN_QUANT_METHODS:
            _enable_env_by_default("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN")
    else:
        _enable_env_by_default("VLLM_USE_PD_SPLIT")
        if architectures == ["Qwen3MoeForCausalLM"]:
            for name in (
                "VLLM_USE_LIGHTOP_MOE_ALIGN",
                "VLLM_USE_LIGHTOP_FILL_MOE_ALIGN",
                "VLLM_USE_LIGHTOP_MOE_SUM",
                "VLLM_USE_FUSE_SILU_AND_MUL",
                "VLLM_USE_OPT_RESHAPE_AND_CACHE",
            ):
                _enable_env_by_default(name)

    # NOTE(review): this helper is only reached when `architectures` contains
    # an entry of _NN_SUPPORTED_ARCHITECTURES, and "DeepseekV32ForCausalLM" is
    # not in that tuple, so this check can currently never be true. Kept as-is
    # to preserve behavior; confirm whether the architecture should be added
    # to the supported list or the check moved outside the gate.
    if architectures == ["DeepseekV32ForCausalLM"]:
        _enable_env_by_default("VLLM_USE_V32_ENCODE")


def _apply_awq_pad_default() -> None:
    """Choose a default for AWQ_PAD when the user has not set one."""
    try:
        # The device is only queried when AWQ_PAD is unset; it is enabled
        # automatically on devices reporting 120 multiprocessors.
        if os.getenv("AWQ_PAD") is None and (
            torch.cuda.get_device_properties(
                torch.cuda.current_device()
            ).multi_processor_count
            == 120
        ):
            os.environ["AWQ_PAD"] = "1"
    except Exception:
        # Best effort: the device could not be queried. Enable padding unless
        # it was explicitly disabled (an explicit "0" is left unchanged).
        logger.debug(
            "Could not query CUDA device properties for AWQ_PAD default",
            exc_info=True,
        )
        if os.getenv("AWQ_PAD") != "0":
            os.environ["AWQ_PAD"] = "1"


def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    """Resolve the model class and architecture name for `model_config`.

    Side effects: before resolving, this writes several process environment
    variables (LLAMA_NN, LM_NN, GEMM_PAD, FA_PAD, AWQ_PAD and a set of
    VLLM_USE_* switches) based on `hf_config.architectures`,
    `envs.VLLM_USE_NN`, `model_config.quantization` and the values already
    present in the environment.

    Args:
        model_config: Model configuration; its registry performs the lookup.

    Returns:
        `(model_cls, arch)`, where `model_cls` has been wrapped by the
        embedding / sequence-classification adapter when
        `model_config.convert_type` is "embed" / "classify".
    """
    from vllm.model_executor.models.adapters import as_embedding_model, as_seq_cls_model

    hf_config = model_config.hf_config
    architectures = getattr(hf_config, "architectures", [])
    visions = getattr(hf_config, "visual", []) or getattr(
        hf_config, "vision_config", []
    )

    if any(arch in architectures for arch in _NN_SUPPORTED_ARCHITECTURES):
        use_nn = envs.VLLM_USE_NN
        if not use_nn:
            _apply_nn_env_flags(architectures, visions)
        # These defaults are identical whether or not VLLM_USE_NN is set.
        _apply_fused_kernel_env_defaults(architectures, model_config)
        if not use_nn:
            # Padding switches stay on only when explicitly set to "1".
            for name in ("GEMM_PAD", "FA_PAD"):
                if os.getenv(name) != "1":
                    os.environ[name] = "0"
        # AWQ-related configuration.
        _apply_awq_pad_default()
    else:
        for name in ("LLAMA_NN", "LM_NN", "GEMM_PAD", "FA_PAD", "AWQ_PAD"):
            os.environ[name] = "0"

    model_cls, arch = model_config.registry.resolve_model_cls(
        architectures,
        model_config=model_config,
    )

    if arch == model_config._get_transformers_backend_cls():
        assert model_config.model_impl != "vllm"
        if model_config.model_impl == "auto":
            logger.warning_once(
                "%s has no vLLM implementation, falling back to Transformers "
                "implementation. Some features may not be supported and "
                "performance may not be optimal.",
                arch,
            )

    convert_type = model_config.convert_type
    if convert_type == "none":
        pass
    elif convert_type == "embed":
        logger.debug_once("Converting to embedding model.")
        model_cls = as_embedding_model(model_cls)
    elif convert_type == "classify":
        logger.debug_once("Converting to sequence classification model.")
        model_cls = as_seq_cls_model(model_cls)
    else:
        assert_never(convert_type)

    return model_cls, arch
310
311


312
313
314
315
316
317
318
319
def get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    """Return `(model class, architecture name)` for `model_config`, cached.

    Results of `_get_model_architecture` are stored in `_MODEL_ARCH_BY_HASH`,
    keyed by the hash of the config fields listed below.
    """
    cache_fields = (
        model_config.model,
        model_config.convert_type,
        model_config.runner_type,
        model_config.trust_remote_code,
        model_config.model_impl,
        model_config.quantization,
        tuple(getattr(model_config.hf_config, "architectures", [])),
    )
    key = hash(cache_fields)

    model_arch = _MODEL_ARCH_BY_HASH.get(key)
    if model_arch is None:
        # Cache miss: resolve once and remember the result.
        model_arch = _MODEL_ARCH_BY_HASH[key] = _get_model_architecture(
            model_config
        )
    return model_arch


332
333
334
335
def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
    """Return only the model class from `get_model_architecture`."""
    model_cls, _ = get_model_architecture(model_config)
    return model_cls


336
337
def get_architecture_class_name(model_config: ModelConfig) -> str:
    """Return only the architecture name from `get_model_architecture`."""
    _, arch_name = get_model_architecture(model_config)
    return arch_name
338
339
340
341
342
343


@dataclass
class ParamMapping:
    """
    A class to handle parameter mapping for model weight loading.
344
    It creates a bidirectional mapping between packed parameters and their
345
346
    constituent parts.
    """
347

348
    packed_mapping: dict[str, list[str]]
349
    inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict)
350
351
352
353
354
355
356
357
358
359
360

    def __post_init__(self):
        for packed_name, sub_params in self.packed_mapping.items():
            # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]})
            if len(sub_params) == 1 and sub_params[0] == packed_name:
                continue
            for index, param_name in enumerate(sub_params):
                self.inverse_packed_mapping[param_name] = (
                    packed_name,
                    index,
                )
361

362
    def get_sub_modules(self, module_name: str) -> tuple[str, list[str]] | None:
363
364
365
366
        for key, value in self.packed_mapping.items():
            if module_name.endswith(key):
                return key, value
        return None
367
368


369
370
371
def configure_quant_config(
    quant_config: QuantizationConfig, model_class: type[nn.Module]
):
    """
    Hand the model class's name mappings to `quant_config` so that it can
    properly match fused modules.

    The mappings are passed by reference, not copied, so updates made to
    them by model_class.__new__ (ex. chatglm, qwen) remain visible to
    quant_config.

    Classes that already use the `SupportsQuant` mixin are skipped. Once
    that mixin has been added to all models, this function can be removed.
    """
    if issubclass(model_class, SupportsQuant):
        return

    mapper = getattr(model_class, "hf_to_vllm_mapper", None)
    packed_modules = getattr(model_class, "packed_modules_mapping", None)

    # Pass the mappings by reference to quant_config.
    if mapper is not None:
        quant_config.apply_vllm_mapper(mapper)
    if packed_modules is not None:
        quant_config.packed_modules_mapping = packed_modules