utils.py 15.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
"""Utilities for selecting and loading models."""
import contextlib
5
6
7
import inspect
import warnings
from contextlib import contextmanager
8
from dataclasses import dataclass, field
9
from typing import Optional
10

zhuwenwen's avatar
zhuwenwen committed
11
import os
12
13
import torch
from torch import nn
14
from typing_extensions import assert_never
15

16
from vllm.attention import Attention
17
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
18
from vllm.logger import init_logger
19
from vllm.model_executor.layers.linear import QKVCrossParallelLinear
20
from vllm.model_executor.layers.quantization.base_config import (
21
    QuantizationConfig, QuantizeMethodBase)
22

23
24
25
26
27
from vllm.model_executor.models.adapters import (
    as_embedding_model, as_reward_model, as_seq_cls_model,
    try_create_mm_pooling_model_cls)
from vllm.model_executor.models.interfaces import (SupportsQuant,
                                                   supports_multimodal)
28

29
from vllm.utils import is_pin_memory_available
30
import vllm.envs as envs
31

32
logger = init_logger(__name__)
guanyu1's avatar
guanyu1 committed
33
from ..models.adapters_custom.adapters_classify import (
guanyu1's avatar
test2  
guanyu1 committed
34
35
    new_hy_05b_dense_official_classification,
    hy_2b_dense_classification_official_hf_multihead_full_mask
guanyu1's avatar
guanyu1 committed
36
)
guanyu1's avatar
test2  
guanyu1 committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def _hunyuan_classify_selector(model_cls: type[nn.Module],
                               hf_config) -> type[nn.Module]:
    """Select the HunYuan sequence-classification adapter by hidden size.

    Known HunYuan variants get a dedicated adapter:

    * ``hidden_size == 2560`` ->
      ``hy_2b_dense_classification_official_hf_multihead_full_mask``
    * ``hidden_size == 1280`` ->
      ``new_hy_05b_dense_official_classification``

    Any other hidden size falls back to the generic ``as_seq_cls_model``
    so that the caller always receives a usable class, never ``None``.

    Args:
        model_cls: The resolved model class to wrap.
        hf_config: Model config object; only ``hidden_size`` is read.

    Returns:
        The model class produced by the selected adapter.
    """
    hidden_size = hf_config.hidden_size
    if hidden_size == 2560:  # extend as needed for other variants
        return hy_2b_dense_classification_official_hf_multihead_full_mask(
            model_cls)
    if hidden_size == 1280:
        return new_hy_05b_dense_official_classification(model_cls)
    # Unknown variant: use the generic adapter instead of returning None,
    # which would otherwise be stored as the model class by the caller.
    return as_seq_cls_model(model_cls)


guanyu1's avatar
guanyu1 committed
55
# Per-architecture overrides for the "classify" conversion. Each value is a
# callable invoked as ``selector(model_cls, hf_config)`` that returns the
# adapted model class; architectures not listed here use the generic
# sequence-classification adapter.
CLASSIFY_CLASSIFY_REGISTRY = {
    # Uses a selector that decides adapter by hidden_size
    "HunYuanForCausalLM": _hunyuan_classify_selector,
}
59
60
61
62
63
64
65
66
67
68

@contextlib.contextmanager
def set_default_torch_dtype(dtype: torch.dtype):
    """Temporarily set the default torch dtype to ``dtype``.

    The previous default is restored when the ``with`` block exits, including
    when the block raises, so a failure inside the block cannot leave the
    process-wide default dtype changed.

    Args:
        dtype: The dtype to install as the default inside the block.
    """
    old_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        yield
    finally:
        # Always undo the global change, even on error.
        torch.set_default_dtype(old_dtype)


69
70
71
72
73
def initialize_model(
    vllm_config: VllmConfig,
    *,
    prefix: str = "",
    model_class: Optional[type[nn.Module]] = None,
    model_config: Optional[ModelConfig] = None,
) -> nn.Module:
    """Initialize a model with the given configurations.

    Args:
        vllm_config: Top-level config; also the source of the cache, quant,
            LoRA and scheduler configs handed to old-style model classes.
        prefix: Prefix forwarded to the model constructor.
        model_class: Class to instantiate. When ``None`` it is resolved with
            ``get_model_architecture(model_config)``.
        model_config: Model config to use. When ``None``,
            ``vllm_config.model_config`` is used.

    Returns:
        The constructed model instance.
    """
    if model_config is None:
        model_config = vllm_config.model_config
    if model_class is None:
        model_class, _ = get_model_architecture(model_config)

    if vllm_config.quant_config is not None:
        configure_quant_config(vllm_config.quant_config, model_class)

    # Inspect the constructor to tell new-style classes (which take
    # `vllm_config` and `prefix`) from old-style ones.
    signatures = inspect.signature(model_class.__init__)
    all_params = [param.name for param in signatures.parameters.values()]
    if "vllm_config" in all_params and "prefix" in all_params:
        # new-style model class
        with set_current_vllm_config(vllm_config,
                                     check_compile=True,
                                     prefix=prefix):
            return model_class(vllm_config=vllm_config, prefix=prefix)

    msg = ("vLLM model class should accept `vllm_config` and `prefix` as "
           "input arguments. Possibly you have an old-style model class"
           " registered from out of tree and it is used for new vLLM version. "
           "Check https://docs.vllm.ai/en/latest/design/arch_overview.html "
           "for the design and update the model class accordingly.")
    warnings.warn(msg, DeprecationWarning, stacklevel=2)

    logger.warning(
        "Trying to guess the arguments for old-style model class %s",
        model_class,
    )
    # try to be compatible with old-style model class: pass only the
    # keyword arguments that its constructor actually declares.
    kwargs = {}
    if "prefix" in all_params:
        kwargs["prefix"] = prefix
    if "config" in all_params:
        kwargs["config"] = model_config.hf_config
    if "cache_config" in all_params:
        kwargs["cache_config"] = vllm_config.cache_config
    if "quant_config" in all_params:
        kwargs["quant_config"] = vllm_config.quant_config
    if "lora_config" in all_params:
        kwargs["lora_config"] = vllm_config.lora_config
    if "scheduler_config" in all_params:
        kwargs["scheduler_config"] = vllm_config.scheduler_config
    with set_current_vllm_config(vllm_config,
                                 check_compile=True,
                                 prefix=prefix):
        return model_class(**kwargs)


def process_weights_after_loading(model: nn.Module, model_config: ModelConfig,
                                  target_device: torch.device) -> None:
    """Run the post-load weight hooks of every submodule of ``model``.

    First pass: ``QKVCrossParallelLinear`` modules run their own hook; any
    other module whose ``quant_method`` is a ``QuantizeMethodBase`` has that
    method's hook run inside ``device_loading_context``.

    Second pass: ``Attention`` modules exposing
    ``process_weights_after_loading`` are called with ``model_config.dtype``.
    """
    for _name, submodule in model.named_modules():
        if isinstance(submodule, QKVCrossParallelLinear):
            # NOTE(Isotr0py): special case for cross QKV layer because
            # q and kv proj aren't registered as submodules intentionally
            submodule.process_weights_after_loading()
        else:
            method = getattr(submodule, "quant_method", None)
            if isinstance(method, QuantizeMethodBase):
                # Quant methods that repack/quantize after loading expect
                # parameters on the global target device. With cpu
                # offloading, the context moves parameters onto the device
                # for processing and back off afterwards.
                with device_loading_context(submodule, target_device):
                    method.process_weights_after_loading(submodule)

    # Currently only used by MLA.
    # NOTE: This intentionally happens after other modules so we can easily
    # decompress the weights for MLA.
    for _name, submodule in model.named_modules():
        if not isinstance(submodule, Attention):
            continue
        if not hasattr(submodule, "process_weights_after_loading"):
            continue
        # TODO(lucas): see if there is a way to unify the signatures
        # of process_weights_after_loading
        submodule.process_weights_after_loading(model_config.dtype)


@contextmanager
def device_loading_context(module: torch.nn.Module,
                           target_device: torch.device):
    """Temporarily move a module's CPU parameters onto ``target_device``.

    When ``target_device`` is a CPU device nothing is moved. Otherwise every
    parameter currently on CPU is moved to ``target_device`` for the duration
    of the ``with`` block and copied back to CPU on exit (into pinned memory
    when ``is_pin_memory_available()`` says so). Parameters that were not on
    CPU at entry, and parameters created inside the block, are left alone.

    Args:
        module: Module whose parameters may be moved.
        target_device: Device the parameters should live on inside the block.

    Yields:
        The same ``module``.
    """
    if target_device.type == "cpu":
        # If target is CPU, no need to move anything
        yield module
        return

    original_device_states: dict[str, torch.device] = {}

    try:
        # Store original device states and move parameters to the target
        # device if they're on CPU. This runs inside `try` so that a failure
        # partway through still restores the parameters already moved.
        for name, p in module.named_parameters():
            if p.device.type == "cpu":
                original_device_states[name] = p.device
                p.data = p.data.to(target_device)
            # Parameters already on target device are not touched

        yield module

    finally:
        # Restore parameters to their original devices, ignoring new parameters
        pin_memory = is_pin_memory_available()
        for name, p in module.named_parameters():
            if name in original_device_states:
                original_device: torch.device = original_device_states[name]
                if original_device.type == "cpu":
                    # `torch.empty_like` does not support `pin_memory` argument
                    cpu_data = torch.empty_strided(
                        size=p.data.size(),
                        stride=p.data.stride(),
                        dtype=p.data.dtype,
                        layout=p.data.layout,
                        device="cpu",
                        pin_memory=pin_memory,
                    )
                    cpu_data.copy_(p.data)
                    p.data = cpu_data
                else:
                    # Defensive: only CPU devices are recorded above, so this
                    # branch is not reached by the current code.
                    p.data = p.data.to(original_device)
        # New parameters or parameters already on target device are untouched


197
# Keyed by an integer hash of the model-config fields that identify an
# architecture; values are ``(model class, architecture name)`` pairs.
_MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]()
"""Caches the outputs of `_get_model_architecture`."""


# TODO: 'Qwen2_5_VLForConditionalGeneration',
_SUPPORT_NN_ARCHITECTURES = (
    'LlamaForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM',
    'Qwen2VLForConditionalGeneration', 'Qwen2MoeForCausalLM',
    'Qwen3ForCausalLM', 'Qwen3MoeForCausalLM', 'ChatGLMModel',
    'Glm4ForCausalLM', 'ChatGLMForConditionalGeneration',
    'BaichuanForCausalLM', 'BloomForCausalLM', 'TeleChat2ForCausalLM',
    'MixtralForCausalLM', 'FalconForCausalLM', 'MedusaModel',
    'MLPSpeculatorPreTrainedModel', 'DeepseekV2ForCausalLM',
    'DeepseekV3ForCausalLM', 'DeepSeekMTPModel',
)


def _set_deepseek_env_defaults(architectures) -> None:
    """Set ``VLLM_USE_LIGHTOP`` / ``VLLM_USE_OPT_CAT`` to ``'1'`` for
    DeepSeek V3 / MTP, unless ``envs.is_set`` reports them as already set."""
    if architectures not in (['DeepseekV3ForCausalLM'],
                             ['DeepSeekMTPModel']):
        return
    if not envs.is_set("VLLM_USE_LIGHTOP"):
        os.environ['VLLM_USE_LIGHTOP'] = '1'
    if not envs.is_set("VLLM_USE_OPT_CAT"):
        os.environ['VLLM_USE_OPT_CAT'] = '1'


def _set_awq_env_defaults() -> None:
    """Fill in ``AWQ_MOE_SZ`` and ``AWQ_PAD`` when they are unset."""
    try:
        if os.getenv('AWQ_MOE_SZ') is None:
            os.environ['AWQ_MOE_SZ'] = '1'
        # The device is only queried when AWQ_PAD was not set explicitly.
        if os.getenv('AWQ_PAD') is None:
            props = torch.cuda.get_device_properties(
                torch.cuda.current_device())
            if props.multi_processor_count == 120:
                os.environ['AWQ_PAD'] = '1'
    except Exception:
        # Best effort: if the device query fails, keep an explicit '0' and
        # otherwise default to '1'.
        if os.getenv('AWQ_PAD') != '0':
            os.environ['AWQ_PAD'] = '1'
        else:
            os.environ['AWQ_PAD'] = '0'


def _configure_nn_env(architectures, visions) -> None:
    """Set the process environment variables derived from the architecture.

    This mutates ``os.environ`` (LLAMA_NN, LM_NN, GEMM_PAD, FA_PAD, AWQ_PAD,
    AWQ_MOE_SZ, VLLM_USE_LIGHTOP, VLLM_USE_OPT_CAT).
    """
    if not any(arch in architectures for arch in _SUPPORT_NN_ARCHITECTURES):
        # Architecture is not in the supported list: force everything off.
        os.environ['LLAMA_NN'] = '0'
        os.environ['LM_NN'] = '0'
        os.environ['GEMM_PAD'] = '0'
        os.environ['FA_PAD'] = '0'
        os.environ['AWQ_PAD'] = '0'
        return

    if not envs.VLLM_USE_NN:
        if os.getenv('LLAMA_NN') != '0':
            # An explicit LLAMA_NN=0 is respected; otherwise it is disabled
            # only for QWen / ChatGLM configs that carry a vision section.
            is_vision_variant = (architectures in (['QWenLMHeadModel'],
                                                   ['ChatGLMModel'])
                                 and visions != [])
            os.environ['LLAMA_NN'] = '0' if is_vision_variant else '1'

        if (architectures in (['BloomForCausalLM'], ['FalconForCausalLM'])
                or os.getenv('LM_NN') == '0'):
            os.environ['LM_NN'] = '0'
        else:
            os.environ['LM_NN'] = '1'

        # Anything other than an explicit '1' is normalized to '0'.
        if os.getenv('GEMM_PAD') != '1':
            os.environ['GEMM_PAD'] = '0'
        if os.getenv('FA_PAD') != '1':
            os.environ['FA_PAD'] = '0'

    # Applied whether or not VLLM_USE_NN is enabled.
    _set_deepseek_env_defaults(architectures)

    # AWQ-related configuration
    _set_awq_env_defaults()


def _get_model_architecture(
        model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    """Resolve the model class and architecture name for ``model_config``.

    Steps:

    1. Configure architecture-dependent environment variables (side effect
       on ``os.environ``).
    2. Resolve ``(model_cls, arch)`` through ``model_config.registry``.
    3. Wrap ``model_cls`` with the adapter for ``model_config.convert_type``
       (``"none"``, ``"embed"``, ``"classify"`` or ``"reward"``).

    Returns:
        ``(model_cls, arch)`` where ``model_cls`` is the possibly adapted
        model class and ``arch`` is the resolved architecture name.
    """
    hf_config = model_config.hf_config
    architectures = getattr(hf_config, "architectures", [])
    visions = (getattr(hf_config, "visual", [])
               or getattr(hf_config, "vision_config", []))
    _configure_nn_env(architectures, visions)

    model_cls, arch = model_config.registry.resolve_model_cls(
        architectures,
        model_config=model_config,
    )

    if arch == model_config._get_transformers_backend_cls():
        assert model_config.model_impl != "vllm"
        if model_config.model_impl == "auto":
            logger.warning_once(
                "%s has no vLLM implementation, falling back to Transformers "
                "implementation. Some features may not be supported and "
                "performance may not be optimal.", arch)

    convert_type = model_config.convert_type
    if convert_type != "none" and supports_multimodal(model_cls):
        logger.debug_once("Detected conversion of Multi Modal model.")
        converted = try_create_mm_pooling_model_cls(model_cls)
        if converted is not None:
            logger.debug_once("Creating wrapper class to forward pooler.")
            return converted, arch
        else:
            logger.debug_once("Attempting direct conversion.")

    if convert_type == "none":
        pass
    elif convert_type == "embed":
        logger.debug_once("Converting to embedding model.")
        model_cls = as_embedding_model(model_cls)
    elif convert_type == "classify":
        # Architecture-specific selectors take precedence over the generic
        # adapter. If a selector declines (returns None), fall back to the
        # generic adapter instead of propagating None as the model class.
        selector = CLASSIFY_CLASSIFY_REGISTRY.get(arch)
        classify_cls = None
        if selector is not None:
            classify_cls = selector(model_cls, model_config.hf_config)
        if classify_cls is None:
            logger.debug_once("Converting to sequence classification model.")
            classify_cls = as_seq_cls_model(model_cls)
        model_cls = classify_cls
    elif convert_type == "reward":
        logger.debug_once("Converting to reward model.")
        model_cls = as_reward_model(model_cls)
    else:
        assert_never(convert_type)

    return model_cls, arch
298
299


300
301
def get_model_architecture(
        model_config: ModelConfig) -> tuple[type[nn.Module], str]:
    """Return ``(model class, architecture name)`` for ``model_config``.

    Results are memoized in ``_MODEL_ARCH_BY_HASH``. The key is a hash of the
    model name, convert type, runner type, ``trust_remote_code``, model
    implementation and the config's architecture list, so
    ``_get_model_architecture`` runs at most once per distinct combination.
    """
    key = hash((
        model_config.model,
        model_config.convert_type,
        model_config.runner_type,
        model_config.trust_remote_code,
        model_config.model_impl,
        tuple(getattr(model_config.hf_config, "architectures", [])),
    ))
    # Cached values are always tuples, so None unambiguously means a miss.
    cached = _MODEL_ARCH_BY_HASH.get(key)
    if cached is not None:
        return cached

    model_arch = _get_model_architecture(model_config)
    _MODEL_ARCH_BY_HASH[key] = model_arch
    return model_arch


318
319
320
321
def get_model_cls(model_config: ModelConfig) -> type[nn.Module]:
    """Return only the model class resolved for ``model_config``."""
    model_cls, _arch = get_model_architecture(model_config)
    return model_cls


322
323
def get_architecture_class_name(model_config: ModelConfig) -> str:
    """Return only the architecture name resolved for ``model_config``."""
    _model_cls, arch = get_model_architecture(model_config)
    return arch
324
325
326
327
328
329
330
331
332


@dataclass
class ParamMapping:
    """
    A class to handle parameter mapping for model weight loading.
    It creates a bidirectional mapping between packed parameters and their
    constituent parts.
    """
    # Packed parameter name -> ordered names of its constituent parameters.
    packed_mapping: dict[str, list[str]]
    # Constituent name -> (packed name, index within the packed parameter).
    # Populated in __post_init__ from `packed_mapping`.
    inverse_packed_mapping: dict[str, tuple[str,
                                            int]] = field(default_factory=dict)

    def __post_init__(self):
        """Build ``inverse_packed_mapping`` from ``packed_mapping``."""
        for packed_name, sub_params in self.packed_mapping.items():
            # Skip self-contained cases (e.g., {"W_pack": ["W_pack"]})
            if len(sub_params) == 1 and sub_params[0] == packed_name:
                continue
            for index, param_name in enumerate(sub_params):
                self.inverse_packed_mapping[param_name] = (
                    packed_name,
                    index,
                )

    def get_sub_modules(self,
                        module_name: str) -> Optional[tuple[str, list[str]]]:
        """Find the packed entry whose key is a suffix of ``module_name``.

        Returns:
            ``(packed name, constituent names)`` for the first key in
            ``packed_mapping`` (in insertion order) that ``module_name``
            ends with, or ``None`` when no key matches.
        """
        for key, value in self.packed_mapping.items():
            if module_name.endswith(key):
                return key, value
        return None
354
355
356


def configure_quant_config(quant_config: QuantizationConfig,
                           model_class: type[nn.Module]):
    """
    Pass packed_modules_mapping by reference to quant_config so that
    quant_config can properly match fused modules

    Note that model attributes are passed by reference to quant_config,
    enabling them to be updated by model_class.__new__ (ex. chatglm, qwen)

    Once the `SupportsQuant` mixin has been added to all models, this
    function can be removed

    Args:
        quant_config: Quantization config to update in place.
        model_class: Model class whose optional ``hf_to_vllm_mapper`` and
            ``packed_modules_mapping`` class attributes are read.
    """
    # Classes deriving from SupportsQuant are skipped entirely here.
    if not issubclass(model_class, SupportsQuant):
        hf_to_vllm_mapper = getattr(model_class, "hf_to_vllm_mapper", None)
        packed_mapping = getattr(model_class, "packed_modules_mapping", None)

        # pass mappings by reference to quant_config
        if hf_to_vllm_mapper is not None:
            quant_config.apply_vllm_mapper(hf_to_vllm_mapper)
        if packed_mapping is not None:
            quant_config.packed_modules_mapping = packed_mapping