# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa: F401
5
import ast
6
import copy
7
import hashlib
8
import inspect
9
import json
10
import os
11
import textwrap
12
from contextlib import contextmanager
13
from dataclasses import field, fields, is_dataclass, replace
14
from functools import cached_property, lru_cache
15
16
from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar,
                    Union, cast)
17

18
import regex as re
19
import torch
20
from pydantic import ConfigDict, SkipValidation
21
from pydantic.dataclasses import dataclass
22
from typing_extensions import runtime_checkable
23

24
import vllm.envs as envs
25
from vllm import version
26
from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
27
                               PrefixCachingHashAlgo)
28
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
29
                                     CUDAGraphMode, PassConfig)
30
from vllm.config.device import Device, DeviceConfig
31
from vllm.config.kv_events import KVEventsConfig
32
from vllm.config.kv_transfer import KVTransferConfig
33
from vllm.config.load import LoadConfig
34
from vllm.config.lora import LoRAConfig
35
36
37
38
39
from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode,
                               ModelConfig, ModelDType, ModelImpl,
                               RunnerOption, TaskOption, TokenizerMode,
                               iter_architecture_defaults,
                               try_match_architecture_defaults)
40
41
from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
                                    MultiModalConfig)
42
from vllm.config.observability import DetailedTraceModules, ObservabilityConfig
43
44
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                  ParallelConfig)
45
from vllm.config.pooler import PoolerConfig
46
from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
47
from vllm.config.speculative import SpeculativeConfig
48
from vllm.config.speech_to_text import SpeechToTextConfig
49
from vllm.config.structured_outputs import StructuredOutputsConfig
50
from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
Woosuk Kwon's avatar
Woosuk Kwon committed
51
from vllm.logger import init_logger
52
from vllm.multimodal import MULTIMODAL_REGISTRY
53
54
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
55

56
if TYPE_CHECKING:
    from _typeshed import DataclassInstance
    from transformers.configuration_utils import PretrainedConfig

    from vllm.model_executor.layers.quantization.base_config import (
        QuantizationConfig)
else:
    # At runtime the type-checking-only imports above are skipped, so these
    # names are bound to `Any` to keep annotations that mention them valid.
    DataclassInstance = Any
    PretrainedConfig = Any
    QuantizationConfig = Any
    # NOTE(review): the three names below have no matching import in the
    # TYPE_CHECKING branch visible here -- confirm whether they are still
    # needed or should gain a type-checking import.
    QuantizationMethods = Any
    BaseModelLoader = Any
    LogitsProcessor = Any
69

70
# Module-level logger for this file.
logger = init_logger(__name__)
# Type variable bounded by `DataclassInstance`.
DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
72

73

74
@runtime_checkable
class SupportsHash(Protocol):
    """Structural type for objects that expose `compute_hash()`.

    Decorated with `runtime_checkable`, so `isinstance(obj, SupportsHash)`
    can be used to test for the presence of the method.
    """

    def compute_hash(self) -> str:
        ...


81
82
class SupportsMetricsInfo(Protocol):
    """Structural type for objects that expose `metrics_info()`.

    `metrics_info()` returns a mapping of string keys to string values.
    """

    def metrics_info(self) -> dict[str, str]:
        ...


87
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
    simplifies passing around the distinct configurations in the codebase.
    """

    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
    model_config: ModelConfig = None  # type: ignore
    """Model configuration."""
    cache_config: CacheConfig = field(default_factory=CacheConfig)
    """Cache configuration."""
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
    """Parallel configuration."""
    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
    """Scheduler configuration."""
    device_config: DeviceConfig = field(default_factory=DeviceConfig)
    """Device configuration."""
    load_config: LoadConfig = field(default_factory=LoadConfig)
    """Load configuration."""
    lora_config: Optional[LoRAConfig] = None
    """LoRA configuration."""
    speculative_config: Optional[SpeculativeConfig] = None
    """Speculative decoding configuration."""
    structured_outputs_config: StructuredOutputsConfig = field(
        default_factory=StructuredOutputsConfig)
    """Structured outputs configuration."""
    observability_config: Optional[ObservabilityConfig] = None
    """Observability configuration."""
    quant_config: Optional[QuantizationConfig] = None
    """Quantization configuration."""
    compilation_config: CompilationConfig = field(
        default_factory=CompilationConfig)
    """`torch.compile` and cudagraph capture configuration for the model.

    As a shorthand, `-O<n>` can be used to directly specify the compilation
    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
    Currently, -O <n> and -O=<n> are supported as well but this will likely be
    removed in favor of clearer -O<n> syntax in the future.

    NOTE: level 0 is the default level without any optimization. level 1 and 2
    are for internal testing only. level 3 is the recommended level for
    production, also default in V1.

    You can specify the full compilation config like so:
    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    kv_transfer_config: Optional[KVTransferConfig] = None
    """The configurations for distributed KV cache transfer."""
    kv_events_config: Optional[KVEventsConfig] = None
    """The configurations for event publishing."""
    # some opaque config, only used to provide additional information
    # for the hash computation, mainly used for testing, debugging or out of
    # tree config registration.
    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
    """Additional config for specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
    instance_id: str = ""
    """The ID of the vLLM instance."""
148

149
150
151
152
153
154
155
156
157
158
159
160
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.

        Returns:
            The first 10 hex characters of the MD5 digest of the string form
            of the collected factors.
        """
        from vllm import __version__

        def _hash_or_none(sub_config: Any) -> str:
            # A falsy (unset) sub-config contributes the literal "None" so
            # that every factor keeps a fixed position in the list.
            return sub_config.compute_hash() if sub_config else "None"

        # summarize vllm config
        vllm_factors: list[Any] = [__version__, envs.VLLM_USE_V1]
        vllm_factors.extend(
            _hash_or_none(sub_config) for sub_config in (
                self.model_config,
                self.cache_config,
                self.parallel_config,
                self.scheduler_config,
                self.device_config,
                self.load_config,
            ))

        if self.lora_config:
            vllm_factors.append(self.lora_config.compute_hash())
            # LoRA creates static buffers based on max_num_batched_tokens.
            # The tensor sizes and strides get captured in the torch.compile
            # graph explicitly.
            vllm_factors.append(
                str(self.scheduler_config.max_num_batched_tokens))
        else:
            vllm_factors.append("None")

        vllm_factors.extend(
            _hash_or_none(sub_config) for sub_config in (
                self.speculative_config,
                self.structured_outputs_config,
                self.observability_config,
            ))

        # quant_config is intentionally not a factor of its own: it should be
        # captured by model_config.quantization.

        vllm_factors.extend(
            _hash_or_none(sub_config) for sub_config in (
                self.compilation_config,
                self.kv_transfer_config,
            ))

        additional_config = self.additional_config
        if not additional_config:
            vllm_factors.append("None")
        elif isinstance(additional_config, dict):
            # Plain dicts are hashed through their key-sorted JSON form.
            vllm_factors.append(
                hashlib.md5(
                    json.dumps(additional_config, sort_keys=True).encode(),
                    usedforsecurity=False,
                ).hexdigest())
        else:
            vllm_factors.append(additional_config.compute_hash())

        factors: list[Any] = [vllm_factors]
        return hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()[:10]

240
241
242
243
244
245
    def pad_for_cudagraph(self, batch_size: int) -> int:
        """Return the padded cudagraph size registered for `batch_size`.

        The value is looked up in
        `self.compilation_config.bs_to_padded_graph_size`. No bounds check is
        done here: callers must keep
        `batch_size <= self.compilation_config.max_capture_size`, otherwise
        the lookup is expected to raise an IndexError.
        """
        padded_sizes = self.compilation_config.bs_to_padded_graph_size
        return padded_sizes[batch_size]
246

247
248
249
250
251
    @staticmethod
    def _get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config.

        Args:
            model_config: Model configuration; `quantization` selects the
                method and `dtype` is checked against the supported
                activation dtypes.
            load_config: Load configuration forwarded to `get_quant_config`.

        Returns:
            The quantization config, or None when
            `model_config.quantization` is None.

        Raises:
            ValueError: If the current device capability is below the
                minimum required by the quantization method, or if
                `model_config.dtype` is not a supported activation dtype.
        """
        from vllm.platforms import current_platform

        if model_config.quantization is None:
            return None

        from vllm.model_executor.model_loader.weight_utils import (
            get_quant_config)
        quant_config = get_quant_config(model_config, load_config)

        # The capability check is skipped when the platform reports none.
        capability_tuple = current_platform.get_device_capability()
        if capability_tuple is not None:
            capability = capability_tuple.to_int()
            min_capability = quant_config.get_min_capability()
            if capability < min_capability:
                raise ValueError(
                    f"The quantization method {model_config.quantization} "
                    "is not supported for the current GPU. Minimum "
                    f"capability: {min_capability}. "
                    f"Current capability: {capability}.")

        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(
                f"{model_config.dtype} is not supported for quantization "
                f"method {model_config.quantization}. Supported dtypes: "
                f"{supported_dtypes}")
        return quant_config
275

276
277
278
279
280
281
282
283
284
285
286
    @staticmethod
    def get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config without touching `model_config`.

        Delegates to `VllmConfig._get_quantization_config` on a deep copy of
        `model_config` and returns its result.
        """
        # For some reason, the _ version of this modifies the model_config
        # object, so a deep copy is handed over to avoid this problem.
        isolated_model_config = copy.deepcopy(model_config)
        return VllmConfig._get_quantization_config(isolated_model_config,
                                                   load_config)

287
288
289
290
291
292
293
294
295
    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
        architectures: Optional[list[str]] = None,
    ) -> "VllmConfig":
        """Return a new config whose model config uses `hf_config`.

        Args:
            hf_config: The HF config to install on the copied model config.
            architectures: If given, `hf_config` is deep-copied and the
                copy's `architectures` attribute is set to this list; the
                caller's `hf_config` object is left untouched.

        Returns:
            A new instance built with `dataclasses.replace`, carrying a deep
            copy of `self.model_config` with `hf_config` set on it.
        """
        if architectures is not None:
            # Work on a copy so the caller's hf_config is not mutated.
            hf_config = copy.deepcopy(hf_config)
            hf_config.architectures = architectures

        # NOTE(review): assumes self.model_config is set; with None the
        # attribute assignment below fails.
        model_config = copy.deepcopy(self.model_config)
        model_config.hf_config = hf_config

        return replace(self, model_config=model_config)

301
302
303
    def __post_init__(self):
        """Verify configs are valid & consistent with each other.

        Beyond cross-checking the sub-configs, this fills in defaults that
        depend on several of them (compilation level, cudagraph mode and
        capture sizes, instance id), disables feature combinations the code
        below treats as unsupported, and lets the current platform apply its
        own adjustments via `check_and_update_config`.

        Raises:
            NotImplementedError: If KV-sharing fast prefill is requested in
                V0 or together with EAGLE speculative decoding.
            AssertionError: If the final cudagraph mode requires piecewise
                compilation but the compilation level is not PIECEWISE, or
                if DBO is enabled with an unsupported all2all backend.
        """

        self.try_verify_and_update_config()

        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(
                self.load_config)

        self.cache_config.verify_with_parallel_config(self.parallel_config)

        if self.lora_config is not None:
            self.lora_config.verify_with_cache_config(self.cache_config)
            self.lora_config.verify_with_model_config(self.model_config)

        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config)

        from vllm.platforms import current_platform
        if (self.model_config is not None
                and self.scheduler_config.chunked_prefill_enabled
                and self.model_config.dtype == torch.float32
                and current_platform.get_device_capability() == (7, 5)):
            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels.")

        # If the user does not explicitly set a compilation level, then
        # we use the default level. The default level depends on other
        # settings (see the below code).
        if self.compilation_config.level is None:
            if envs.VLLM_USE_V1:
                if (self.model_config is not None
                        and not self.model_config.enforce_eager):
                    self.compilation_config.level = CompilationLevel.PIECEWISE
                else:
                    self.compilation_config.level = \
                        CompilationLevel.NO_COMPILATION
            else:
                # NB: Passing both --enforce-eager and a compilation level
                # in V0 means the compilation level wins out.
                self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # async tp is built on top of sequence parallelism
        # and requires it to be enabled.
        if self.compilation_config.pass_config.enable_async_tp:
            self.compilation_config.pass_config.enable_sequence_parallelism = \
                True
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            self.compilation_config.custom_ops.append("+rms_norm")

        if current_platform.support_static_graph_mode():
            # if cudagraph_mode is not explicitly set by users, set default
            # value
            if self.compilation_config.cudagraph_mode is None:
                if (envs.VLLM_USE_V1 and self.compilation_config.level
                        == CompilationLevel.PIECEWISE):
                    # default to full and piecewise for most models
                    self.compilation_config.cudagraph_mode = \
                        CUDAGraphMode.FULL_AND_PIECEWISE

                    # pooling model does not support full cudagraphs
                    if (self.model_config is not None
                            and self.model_config.pooler_config is not None):
                        self.compilation_config.cudagraph_mode = \
                            CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

            # disable cudagraph when enforce eager execution
            if (self.model_config is not None
                    and self.model_config.enforce_eager):
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            elif envs.VLLM_USE_V1:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()
        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if (self.cache_config.cpu_offload_gb > 0
                and self.compilation_config.level
                != CompilationLevel.NO_COMPILATION
                and not envs.VLLM_USE_V1):
            logger.warning(
                "CPU offload is not supported with `torch.compile` in v0 yet."
                " Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        if self.cache_config.kv_sharing_fast_prefill:
            if not envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not supported "
                    "in V0 currently.")

            if (self.speculative_config is not None
                    and self.speculative_config.use_eagle()):
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens.")

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")

        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                and self.compilation_config.level
                != CompilationLevel.NO_COMPILATION):
            logger.warning(
                "LoRA for V0 is not supported with `torch.compile` yet. "
                "Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # Collect every reason for turning chunked prefill (and prefix
        # caching) off; they are logged and applied together further down.
        disable_chunked_prefill_reasons: list[str] = []

        if self.model_config:
            if self.model_config.pooler_config:
                pooling_type = self.model_config.pooler_config.pooling_type
                if pooling_type is None or pooling_type.lower() != "last":
                    disable_chunked_prefill_reasons.append(
                        "Only \"last\" pooling supports chunked "
                        "prefill and prefix caching; disabling both.")
                if not getattr(self.model_config.hf_config, "is_causal", True):
                    disable_chunked_prefill_reasons.append(
                        "Only models using causal attention supports chunked "
                        "prefill and prefix caching; disabling both.")
            elif self.model_config.is_encoder_decoder:
                self.scheduler_config.max_num_encoder_input_tokens = \
                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
                        self.model_config)
                logger.debug(
                    "Encoder-decoder model detected: setting "
                    "`max_num_encoder_input_tokens` to encoder length (%s)",
                    self.scheduler_config.max_num_encoder_input_tokens)
                self.scheduler_config.disable_chunked_mm_input = True
                disable_chunked_prefill_reasons.append(
                    "Encoder-decoder models do not support chunked prefill nor"
                    " prefix caching; disabling both.")
                if (self.model_config.architecture
                        == "WhisperForConditionalGeneration"
                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
                        != "spawn"):
                    logger.warning(
                        "Whisper is known to have issues with "
                        "forked workers. If startup is hanging, "
                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                        "to 'spawn'.")

        if disable_chunked_prefill_reasons:
            for reason in disable_chunked_prefill_reasons:
                logger.info(reason)
            self.scheduler_config.chunked_prefill_enabled = False
            self.scheduler_config.long_prefill_token_threshold = 0

            if self.cache_config is not None:
                self.cache_config.enable_prefix_caching = False

        if (self.kv_events_config is not None
                and self.kv_events_config.enable_kv_cache_events
                and not self.cache_config.enable_prefix_caching):
            # Adjacent literals are joined verbatim, so the separating space
            # has to be spelled out.
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled. "
                "Use --enable-prefix-caching to enable.")
        if (self.kv_events_config is not None
                and self.kv_events_config.publisher != "null"
                and not self.kv_events_config.enable_kv_cache_events):
            logger.warning("KV cache events are disabled, "
                           "but the scheduler is configured to publish them. "
                           "Modify KVEventsConfig.enable_kv_cache_events "
                           "to True to enable.")
        current_platform.check_and_update_config(self)

        # final check of cudagraph mode after platform-specific update
        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
            if (self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
                    and self.model_config is not None
                    and not self.model_config.disable_cascade_attn):
                logger.info("CUDAGraphMode.FULL is not supported with "
                            "cascade attention currently. Disabling cascade "
                            "attention.")
                self.model_config.disable_cascade_attn = True

            if self.compilation_config.cudagraph_mode\
                .requires_piecewise_compilation():
                assert self.compilation_config.level == \
                    CompilationLevel.PIECEWISE, \
                    "Compilation level should be CompilationLevel.PIECEWISE "\
                    "when cudagraph_mode piecewise cudagraphs is used, "\
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

        if self.parallel_config.enable_dbo:
            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
            assert a2a_backend in \
                ["deepep_low_latency", "deepep_high_throughput"], \
            "Microbatching currently only supports the deepep_low_latency and "\
            f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
            "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
            "variable to deepep_low_latency or deepep_high_throughput and "\
            "install the DeepEP kernels."

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        # Do this after all the updates to compilation_config.level
        if (envs.VLLM_USE_V1
                and self.compilation_config.level
                == CompilationLevel.PIECEWISE):
            self.compilation_config.set_splitting_ops_for_v1()

        if (envs.VLLM_USE_V1
                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
            # logger should only print warning message for hybrid models. As we
            # can't know whether the model is hybrid or not now, so we don't log
            # warning message here and will log it later.
            if not current_platform.support_hybrid_kv_cache():
                # Hybrid KV cache manager is not supported on non-GPU platforms.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_transfer_config is not None:
                # Hybrid KV cache manager is not compatible with KV transfer.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_events_config is not None:
                # Hybrid KV cache manager is not compatible with KV events.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if (self.model_config is not None
                    and self.model_config.attention_chunk_size is not None):
                if (self.speculative_config is not None
                        and self.speculative_config.use_eagle()):
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention + eagle.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
                elif \
                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                    logger.warning(
                        "There is a latency regression when using chunked local"
                        " attention with the hybrid KV cache manager. Disabling"
                        " it, by default. To enable it, set the environment "
                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                    )
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True

        def has_blocked_weights():
            # True when the quantization config reports block-wise weights,
            # either through a `weight_block_size` attribute or through a
            # `has_blocked_weights()` method.
            if self.quant_config is not None:
                if hasattr(self.quant_config, "weight_block_size"):
                    return self.quant_config.weight_block_size is not None
                elif hasattr(self.quant_config, "has_blocked_weights"):
                    return self.quant_config.has_blocked_weights()
            return False

        # Enable quant_fp8 CUDA ops (TODO disable in follow up)
        # On H100 the CUDA kernel is faster than
        # native implementation
        # https://github.com/vllm-project/vllm/issues/25094
        if has_blocked_weights():
            custom_ops = self.compilation_config.custom_ops
            if "none" not in custom_ops and "-quant_fp8" not in custom_ops:
                custom_ops.append("+quant_fp8")

565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
    def update_sizes_for_sequence_parallelism(self,
                                              possible_sizes: list) -> list:
        """Keep only the sizes that are a multiple of the TP size.

        Used when sequence parallelism is enabled. Sizes that are not a
        multiple of `self.parallel_config.tensor_parallel_size` are dropped,
        and a warning listing them is logged if any were dropped.

        Args:
            possible_sizes: Candidate batch sizes.

        Returns:
            A new list with the surviving sizes, in their original order.
        """
        tp_size = self.parallel_config.tensor_parallel_size

        kept_sizes: list = []
        dropped_sizes: list = []
        for candidate in possible_sizes:
            # A non-zero remainder means the size is not a multiple of TP.
            if candidate % tp_size != 0:
                dropped_sizes.append(candidate)
            else:
                kept_sizes.append(candidate)

        if dropped_sizes:
            logger.warning(
                "Batch sizes %s are removed because they are not "
                "multiple of tp_size %d when "
                "sequence parallelism is enabled", dropped_sizes, tp_size)

        return kept_sizes

585
586
    def _set_cudagraph_sizes(self):
        """
587
588
        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:
589

590
591
592
593
        ```python
        max_graph_size = min(max_num_seqs * 2, 512)
        # 1, 2, 4, then multiples of 8 up to max_graph_size
        cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
594

595
596
        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in descending order).
597

598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic.
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
            padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
            not be used.
621
622
623
624
625
626
627
628
629
630
        """

        # calculate the default `batch_size_capture_list`
        if not envs.VLLM_USE_V1:
            batch_size_capture_list = []
            if self.scheduler_config is not None and \
                self.model_config is not None and \
                    not self.model_config.enforce_eager:

                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
631
632
633
634
635
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    possible_sizes = self.update_sizes_for_sequence_parallelism(
                        possible_sizes)

636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
                # find the minimum size that is larger than max_num_seqs,
                # which then becomes the max_batchsize_to_capture
                larger_sizes = [
                    x for x in possible_sizes
                    if x >= self.scheduler_config.max_num_seqs
                ]
                if larger_sizes:
                    max_batchsize_to_capture = larger_sizes[0]
                else:
                    max_batchsize_to_capture = possible_sizes[-1]

                # filter out the sizes that are
                # larger than max_batchsize_to_capture
                batch_size_capture_list = [
                    size for size in possible_sizes
                    if size <= max_batchsize_to_capture
                ]
        else:
            batch_size_capture_list = []
            if self.model_config is not None and \
                not self.model_config.enforce_eager:
657
658
659
660
661
662
663
664
                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
                if len(cuda_graph_sizes) == 1:
                    batch_size_capture_list = [1, 2, 4] + [
                        i for i in range(8, cuda_graph_sizes[0] + 1, 8)
                    ]
                elif len(cuda_graph_sizes) > 1:
                    batch_size_capture_list = sorted(cuda_graph_sizes)
                else:
Cyrus Leung's avatar
Cyrus Leung committed
665
                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
666
667
668
669
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    batch_size_capture_list = \
                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
670
671
672
673
674
                max_num_tokens = self.scheduler_config.max_num_batched_tokens
                batch_size_capture_list = [
                    size for size in batch_size_capture_list
                    if size <= max_num_tokens
                ]
675
676
677
678

        self.compilation_config.init_with_cudagraph_sizes(
            batch_size_capture_list)

679
    def recalculate_max_model_len(self, max_model_len: int):
        """Validate a new maximum model length and propagate it.

        The requested value is passed to the model config's
        `get_and_verify_max_len`; whatever that returns is written to both
        the model config and the scheduler config.

        Note: Can only be called in `try_verify_and_update_config`.

        Args:
            max_model_len: The requested maximum model length.
        """
        verified_len = self.model_config.get_and_verify_max_len(max_model_len)
        # Keep both configs in sync with the verified value.
        for target in (self.model_config, self.scheduler_config):
            target.max_model_len = verified_len
685
686

    def try_verify_and_update_config(self):
        """Run the model-specific config hooks, at most once per model config.

        Returns immediately when there is no model config, when the model
        config was already processed (tracked through its `config_updated`
        attribute), or when its architecture is `None`. Otherwise the
        following are applied in order: the `MODELS_CONFIG_MAP` entry for the
        architecture (if any), the hybrid attention/mamba hook, the
        sequence-classification adapter hook, and the Run:ai load-format
        check.

        Raises:
            ValueError: If the model weights are a Run:ai object URI but
                `load_format` is neither "auto" nor "runai_streamer".
        """
        model_config = self.model_config
        if model_config is None:
            return

        # Avoid running try_verify_and_update_config multiple times
        if getattr(model_config, "config_updated", False):
            return
        model_config.config_updated = True

        architecture = model_config.architecture
        if architecture is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)

        arch_config_cls = MODELS_CONFIG_MAP.get(architecture)
        if arch_config_cls is not None:
            arch_config_cls.verify_and_update_config(self)

        # The hooks receive `self`, so re-read `self.model_config` from here
        # on instead of relying on the alias taken above.
        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)
            SequenceClassificationConfig.verify_and_update_config(self)

        uses_runai_weights = (
            hasattr(self.model_config, "model_weights")
            and is_runai_obj_uri(self.model_config.model_weights))
        if not uses_runai_weights:
            return

        load_format = self.load_config.load_format
        if load_format == "auto":
            logger.info("Detected Run:ai model config. "
                        "Overriding `load_format` to 'runai_streamer'")
            self.load_config.load_format = "runai_streamer"
        elif load_format != "runai_streamer":
            raise ValueError(f"To load a model from S3, 'load_format' "
                             f"must be 'runai_streamer', "
                             f"but got '{load_format}'. "
                             f"Model: {self.model_config.model}")

726
    def __str__(self):
        """Summarize the most relevant settings as one comma-separated line.

        Values are read from the model, load, parallel, cache, device and
        scheduler sub-configs; the speculative, structured-outputs,
        observability, pooler and compilation configs are embedded through
        their `repr`.
        """
        model = self.model_config
        load = self.load_config
        parallel = self.parallel_config
        cache = self.cache_config
        scheduler = self.scheduler_config
        return (
            f"model={model.model!r}, "
            f"speculative_config={self.speculative_config!r}, "
            f"tokenizer={model.tokenizer!r}, "
            f"skip_tokenizer_init={model.skip_tokenizer_init}, "
            f"tokenizer_mode={model.tokenizer_mode}, "
            f"revision={model.revision}, "
            f"tokenizer_revision={model.tokenizer_revision}, "
            f"trust_remote_code={model.trust_remote_code}, "
            f"dtype={model.dtype}, "
            f"max_seq_len={model.max_model_len}, "
            f"download_dir={load.download_dir!r}, "
            f"load_format={load.load_format}, "
            f"tensor_parallel_size={parallel.tensor_parallel_size}, "
            f"pipeline_parallel_size={parallel.pipeline_parallel_size}, "
            f"data_parallel_size={parallel.data_parallel_size}, "
            f"disable_custom_all_reduce={parallel.disable_custom_all_reduce}, "
            f"quantization={model.quantization}, "
            f"enforce_eager={model.enforce_eager}, "
            f"kv_cache_dtype={cache.cache_dtype}, "
            f"device_config={self.device_config.device}, "
            f"structured_outputs_config={self.structured_outputs_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={model.seed}, "
            f"served_model_name={model.served_model_name}, "
            f"enable_prefix_caching={cache.enable_prefix_caching}, "
            f"chunked_prefill_enabled={scheduler.chunked_prefill_enabled}, "
            f"pooler_config={model.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")
756
757
758


# Config installed by `set_current_vllm_config` for the duration of its
# context; None when no such context is active (`get_current_vllm_config`
# then falls back to a default `VllmConfig()`).
_current_vllm_config: Optional[VllmConfig] = None
759
# Model prefix installed by `set_current_vllm_config`; None outside of such a
# context or when the context was entered without a prefix.
_current_prefix: Optional[str] = None
760
761
762


@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig,
                            check_compile=False,
                            prefix: Optional[str] = None):
    """
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.

    Args:
        vllm_config: Config returned by `get_current_vllm_config()` while
            the context is active.
        check_compile: If True, run the compilation sanity checks once the
            body of the `with` statement finished without raising.
        prefix: Model prefix returned by `get_current_model_prefix()` while
            the context is active.

    The previously active config and prefix are restored on exit, whether
    or not the body raised.
    """
    global _current_vllm_config, _current_prefix
    old_vllm_config = _current_vllm_config
    old_prefix = _current_prefix
    from vllm.compilation.counter import compilation_counter
    num_models_seen = compilation_counter.num_models_seen
    try:
        _current_vllm_config = vllm_config
        _current_prefix = prefix
        # The cache may still hold the compilation config of the previously
        # active (outer or default) vLLM config. Drop it on entry as well,
        # otherwise `get_cached_compilation_config()` would keep returning
        # that stale value inside this context.
        get_cached_compilation_config.cache_clear()
        yield
        # Only reached when the body of the `with` statement did not raise.
        if check_compile:
            vllm_config.compilation_config.custom_op_log_check()

            if vllm_config.compilation_config.level \
                == CompilationLevel.PIECEWISE \
                and compilation_counter.num_models_seen == num_models_seen:
                # If the model supports compilation,
                # compilation_counter.num_models_seen should be increased
                # by at least 1.
                # If it is not increased, it means the model does not support
                # compilation (does not have @support_torch_compile
                # decorator).
                logger.warning(
                    "`torch.compile` is turned on, but the model %s"
                    " does not support it. Please open an issue on GitHub"
                    " if you want it to be supported.",
                    vllm_config.model_config.model)
    finally:
        _current_vllm_config = old_vllm_config
        _current_prefix = old_prefix
        # Clear the compilation config cache when context changes
        get_cached_compilation_config.cache_clear()


@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Return the compilation config of the current vLLM config, memoized.

    Caching the single value avoids repeated calls to
    `get_current_vllm_config()`; `set_current_vllm_config` invalidates the
    cache through `cache_clear()`.
    """
    current_config = get_current_vllm_config()
    return current_config.compilation_config
812
813
814
815
816
817
818


def get_current_vllm_config() -> VllmConfig:
    """Return the active vLLM config, or a default one if none is set.

    When no config is active, a warning is logged and a fresh default
    `VllmConfig()` is returned; that fallback instance is not stored.
    """
    active_config = _current_vllm_config
    if active_config is not None:
        return active_config

    # in ci, usually when we test custom ops/modules directly,
    # we don't set the vllm config. In that case, we fall back to a default
    # config.
    logger.warning("Current vLLM config is not set.")
    from vllm.config import VllmConfig
    return VllmConfig()
823
824


825
826
827
828
829
830
831
832
833
def get_current_model_prefix() -> str:
    """
    Get the prefix of the model that's currently being initialized.

    Returns:
        The prefix most recently installed by `set_current_vllm_config`.

    Raises:
        AssertionError: If no prefix is currently set.
    """
    # Raise explicitly rather than via `assert`: assert statements are
    # stripped under `python -O`, which would let this function silently
    # return None despite its `str` return type.
    if _current_prefix is None:
        raise AssertionError("Current model prefix is not set. ")
    return _current_prefix


834
835
836
# Generic layer type: lets `get_layers_from_vllm_config` tie the requested
# `layer_type` to the value type of the dict it returns.
T = TypeVar("T")


837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
def get_layers_from_vllm_config(
        vllm_config: VllmConfig,
        layer_type: type[T],
        layer_names: Optional[list[str]] = None) -> dict[str, T]:
    """
    Collect the layers of a given type from the vLLM config.

    Layers are looked up in
    `vllm_config.compilation_config.static_forward_context`.

    Args:
        vllm_config: The vLLM config.
        layer_type: Only layers that are instances of this type are returned.
        layer_names: The names of the layers to consider. If None, every
            layer in the static forward context is considered.

    Returns:
        A mapping from layer name to layer, in the order the names were
        visited. A requested name that is absent from the context is not
        skipped; the lookup error propagates.
    """
    registered = vllm_config.compilation_config.static_forward_context
    names = list(registered.keys()) if layer_names is None else layer_names

    matching = {}
    for name in names:
        layer = registered[name]
        if isinstance(layer, layer_type):
            matching[name] = layer
    return matching
861
862


863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
def update_config(config: DataclassInstanceT,
                  overrides: dict[str, Any]) -> DataclassInstanceT:
    """Return a copy of `config` with `overrides` applied.

    Each key of `overrides` must name an existing attribute of `config`.
    When the current value of that attribute is a dataclass and the override
    is not, the override must be a dict and is applied recursively to the
    nested dataclass. Any other override replaces the value as-is. `config`
    itself is not modified; the result is built with `dataclasses.replace`.

    Args:
        config: The dataclass instance to derive the new config from.
        overrides: Mapping from field name to new value (or to a nested dict
            of overrides for a dataclass-valued field).

    Returns:
        A new instance of the same dataclass with the overrides applied.

    Raises:
        AssertionError: If a key is not an attribute of `config`, or if a
            dataclass-valued field is overridden with something that is
            neither a dataclass nor a dict.
    """
    resolved = {}
    for name, new_value in overrides.items():
        assert hasattr(config,
                       name), f"{type(config)} has no field `{name}`"
        existing = getattr(config, name)
        needs_merge = is_dataclass(existing) and not is_dataclass(new_value)
        if needs_merge:
            assert isinstance(new_value, dict), (
                f"Overrides to {type(config)}.{name} must be a dict"
                f"  or {type(existing)}, but got {type(new_value)}")
            # Nested dataclass: merge the partial dict into it.
            new_value = update_config(
                existing,  # type: ignore[type-var]
                new_value)
        resolved[name] = new_value
    return replace(config, **resolved)