__init__.py 37.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

zhuwenwen's avatar
zhuwenwen committed
4
import os
5

6
# ruff: noqa: F401
7
import ast
8
import copy
9
import hashlib
10
import inspect
11
import json
12
import os
13
import textwrap
14
from contextlib import contextmanager
15
from dataclasses import field, fields, is_dataclass, replace
16
from functools import cached_property, lru_cache
17

18
from importlib.util import find_spec
19

20
21

from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar, List,
22
                    Union, cast)
23
24


25
import regex as re
26
import torch
27
from pydantic import ConfigDict, SkipValidation
28
from pydantic.dataclasses import dataclass
29
from typing_extensions import runtime_checkable
30

31
import vllm.envs as envs
32
from vllm import version
33
from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
34
                               PrefixCachingHashAlgo)
35
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
36
                                     CUDAGraphMode, PassConfig)
37
from vllm.config.device import Device, DeviceConfig
38
from vllm.config.kv_events import KVEventsConfig
39
from vllm.config.kv_transfer import KVTransferConfig
40
from vllm.config.load import LoadConfig
41
from vllm.config.lora import LoRAConfig
42
43
44
45
46
from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode,
                               ModelConfig, ModelDType, ModelImpl,
                               RunnerOption, TaskOption, TokenizerMode,
                               iter_architecture_defaults,
                               try_match_architecture_defaults)
47
48
from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
                                    MultiModalConfig)
49
from vllm.config.observability import DetailedTraceModules, ObservabilityConfig
50
51
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                  ParallelConfig)
52
from vllm.config.pooler import PoolerConfig
53
from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
54
from vllm.config.speculative import SpeculativeConfig
55
from vllm.config.speech_to_text import SpeechToTextConfig
56
from vllm.config.structured_outputs import StructuredOutputsConfig
57
from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
Woosuk Kwon's avatar
Woosuk Kwon committed
58
from vllm.logger import init_logger
59
from vllm.multimodal import MULTIMODAL_REGISTRY
60

61
62
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
63
from vllm.platforms import current_platform
64
from vllm import envs
65

66
if TYPE_CHECKING:
67
    from _typeshed import DataclassInstance
68
    from transformers.configuration_utils import PretrainedConfig
69

70
71
72
    from vllm.model_executor.layers.quantization.base_config import (
        QuantizationConfig)
else:
73
    DataclassInstance = Any
74
    PretrainedConfig = Any
75
    QuantizationConfig = Any
76
    QuantizationMethods = Any
77
    BaseModelLoader = Any
78
    LogitsProcessor = Any
79

80
81
logger = init_logger(__name__)

82
DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
83

84

85
@runtime_checkable
class SupportsHash(Protocol):
    """Structural type for any object that exposes `compute_hash()`.

    Decorated with `runtime_checkable`, so `isinstance(obj, SupportsHash)`
    can be used to test for the presence of the method.
    """

    def compute_hash(self) -> str:
        """Return a string hash of this object."""
        ...


92
93
class SupportsMetricsInfo(Protocol):
    """Structural type for any object that exposes `metrics_info()`."""

    def metrics_info(self) -> dict[str, str]:
        """Return a string-to-string mapping describing this object."""
        ...


98
@config
99
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
100
101
class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
102
103
104
    simplifies passing around the distinct configurations in the codebase.
    """

105
106
107
    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
    model_config: ModelConfig = None  # type: ignore
108
109
110
111
112
113
114
115
116
117
118
    """Model configuration."""
    cache_config: CacheConfig = field(default_factory=CacheConfig)
    """Cache configuration."""
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
    """Parallel configuration."""
    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
    """Scheduler configuration."""
    device_config: DeviceConfig = field(default_factory=DeviceConfig)
    """Device configuration."""
    load_config: LoadConfig = field(default_factory=LoadConfig)
    """Load configuration."""
119
    lora_config: Optional[LoRAConfig] = None
120
121
122
    """LoRA configuration."""
    speculative_config: Optional[SpeculativeConfig] = None
    """Speculative decoding configuration."""
123
124
125
    structured_outputs_config: StructuredOutputsConfig = field(
        default_factory=StructuredOutputsConfig)
    """Structured outputs configuration."""
126
    observability_config: Optional[ObservabilityConfig] = None
127
    """Observability configuration."""
128
    quant_config: Optional[QuantizationConfig] = None
129
130
131
    """Quantization configuration."""
    compilation_config: CompilationConfig = field(
        default_factory=CompilationConfig)
132
    """`torch.compile` and cudagraph capture configuration for the model.
133

134
135
    As a shorthand, `-O<n>` can be used to directly specify the compilation
    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
136
    Currently, -O <n> and -O=<n> are supported as well but this will likely be
137
    removed in favor of clearer -O<n> syntax in the future.
138
139
140

    NOTE: level 0 is the default level without any optimization. level 1 and 2
    are for internal testing only. level 3 is the recommended level for
141
    production, also default in V1.
142
143
144
145
146
147

    You can specify the full compilation config like so:
    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    kv_transfer_config: Optional[KVTransferConfig] = None
    """The configurations for distributed KV cache transfer."""
148
    kv_events_config: Optional[KVEventsConfig] = None
149
    """The configurations for event publishing."""
150
    # some opaque config, only used to provide additional information
151
152
    # for the hash computation, mainly used for testing, debugging or out of
    # tree config registration.
153
154
155
156
    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
    """Additional config for specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
157
    instance_id: str = ""
158
    """The ID of the vLLM instance."""
159

160
161
162
163
164
165
166
167
168
169
170
171
    def compute_hash(self) -> str:
        """
        Return a 10-character hex digest identifying this configuration.

        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        The digest is meant to cover all the configs that affect the
        structure of the computation graph from input ids/embeddings to the
        final hidden states, excluding anything before input ids/embeddings
        and after the final hidden states.
        """
        from vllm import __version__

        def digest_or_none(sub_config: Any) -> str:
            # A falsy (e.g. unset) sub-config contributes the literal "None".
            return sub_config.compute_hash() if sub_config else "None"

        # summarize vllm config
        vllm_factors: list[Any] = [__version__, envs.VLLM_USE_V1]
        vllm_factors.extend(
            digest_or_none(sub_config) for sub_config in (
                self.model_config,
                self.cache_config,
                self.parallel_config,
                self.scheduler_config,
                self.device_config,
                self.load_config,
            ))

        if self.lora_config:
            # LoRA creates static buffers based on max_num_batched_tokens.
            # The tensor sizes and strides get captured in the torch.compile
            # graph explicitly.
            vllm_factors += [
                self.lora_config.compute_hash(),
                str(self.scheduler_config.max_num_batched_tokens),
            ]
        else:
            vllm_factors.append("None")

        vllm_factors.extend(
            digest_or_none(sub_config) for sub_config in (
                self.speculative_config,
                self.structured_outputs_config,
                self.observability_config,
            ))

        # quant_config is deliberately not hashed here: it should be captured
        # by model_config.quantization.
        vllm_factors.extend(
            digest_or_none(sub_config) for sub_config in (
                self.compilation_config,
                self.kv_transfer_config,
            ))

        extra = self.additional_config
        if not extra:
            extra_digest = "None"
        elif isinstance(extra, dict):
            # Plain dicts are hashed through their key-sorted JSON form.
            extra_digest = hashlib.md5(
                json.dumps(extra, sort_keys=True).encode(),
                usedforsecurity=False,
            ).hexdigest()
        else:
            extra_digest = extra.compute_hash()
        vllm_factors.append(extra_digest)

        factors: list[Any] = [vllm_factors]
        return hashlib.md5(str(factors).encode(),
                           usedforsecurity=False).hexdigest()[:10]

251
252
253
254
255
256
    def pad_for_cudagraph(self, batch_size: int) -> int:
        """Return the padded graph size recorded for `batch_size`.

        The value is read from
        `compilation_config.bs_to_padded_graph_size[batch_size]`. No range
        check is done here: the caller should make sure that
        `batch_size <= self.compilation_config.max_capture_size`; a larger
        value should raise an IndexError from the lookup.
        """
        padded_sizes = self.compilation_config.bs_to_padded_graph_size
        return padded_sizes[batch_size]
257

258
259
260
261
262
    @staticmethod
    def _get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config.

        Returns None when `model_config.quantization` is None. Otherwise the
        config is loaded via `get_quant_config`, validated against the
        current platform's device capability (when one is reported) and
        against `model_config.dtype`, then returned.

        Raises:
            ValueError: if the device capability is below the method's
                minimum, or the model dtype is not a supported activation
                dtype for the method.
        """
        from vllm.platforms import current_platform

        if model_config.quantization is None:
            return None

        from vllm.model_executor.model_loader.weight_utils import (
            get_quant_config)
        quant_config = get_quant_config(model_config, load_config)

        # The capability check is skipped when the platform reports none.
        capability_tuple = current_platform.get_device_capability()
        if capability_tuple is not None:
            capability = capability_tuple.to_int()
            if capability < quant_config.get_min_capability():
                raise ValueError(
                    f"The quantization method {model_config.quantization} "
                    "is not supported for the current GPU. Minimum "
                    f"capability: {quant_config.get_min_capability()}. "
                    f"Current capability: {capability}.")

        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(
                f"{model_config.dtype} is not supported for quantization "
                f"method {model_config.quantization}. Supported dtypes: "
                f"{supported_dtypes}")

        quant_config.maybe_update_config(model_config.model)
        return quant_config
287

288
289
290
291
292
293
294
295
296
297
298
    @staticmethod
    def get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config without touching `model_config`.

        Delegates to `_get_quantization_config` on a deep copy of
        `model_config`.
        """
        from copy import deepcopy

        # For some reason, the _ version of this modifies the model_config
        # object, so using deepcopy to avoid this problem.
        model_config_copy = deepcopy(model_config)
        return VllmConfig._get_quantization_config(model_config_copy,
                                                   load_config)

299
300
301
302
303
304
305
306
307
    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
        architectures: Optional[list[str]] = None,
    ) -> "VllmConfig":
        """Return a copy of this config whose model config uses `hf_config`.

        When `architectures` is given, `hf_config` is deep-copied first and
        the copy's `architectures` attribute is overridden, so the caller's
        object is left untouched. The model config is deep-copied as well;
        `self` is not modified.
        """
        new_hf_config = hf_config
        if architectures is not None:
            new_hf_config = copy.deepcopy(hf_config)
            new_hf_config.architectures = architectures

        cloned_model_config = copy.deepcopy(self.model_config)
        cloned_model_config.hf_config = new_hf_config

        return replace(self, model_config=cloned_model_config)
312
313
314
315

    def __post_init__(self):
        """Verify configs are valid & consistent with each other.

        Besides cross-checking the sub-configs, this fills in values that
        were left unset: the quantization config, the compilation level, the
        cudagraph mode and capture sizes, and the instance id. It also
        disables chunked prefill / prefix caching and the hybrid KV cache
        manager for the setups flagged below.

        Raises:
            NotImplementedError: if KV-sharing fast prefill is combined with
                EAGLE speculative decoding.
            AssertionError: if the cudagraph mode needs piecewise
                compilation but the level is not PIECEWISE, or if DBO is
                enabled with an unsupported all2all backend.
        """

        self.try_verify_and_update_config()

        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(
                self.load_config)

        self.cache_config.verify_with_parallel_config(self.parallel_config)

        if self.lora_config is not None:
            self.lora_config.verify_with_cache_config(self.cache_config)
            self.lora_config.verify_with_model_config(self.model_config)

        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config)

        from vllm.platforms import current_platform
        if self.model_config is not None and \
            self.scheduler_config.chunked_prefill_enabled and \
            self.model_config.dtype == torch.float32 and \
            current_platform.get_device_capability() == (7, 5):
            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels.")

        # If the user does not explicitly set a compilation level, then
        # we use the default level. The default level depends on other
        # settings (see the below code).
        if self.compilation_config.level is None:
            if envs.VLLM_USE_V1:
                if (self.model_config is not None
                        and not self.model_config.enforce_eager):
                    self.compilation_config.level = CompilationLevel.PIECEWISE
                else:
                    self.compilation_config.level = \
                            CompilationLevel.NO_COMPILATION

            else:
                # NB: Passing both --enforce-eager and a compilation level
                # in V0 means the compilation level wins out.
                self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # async tp is built on top of sequence parallelism
        # and requires it to be enabled.
        if self.compilation_config.pass_config.enable_async_tp:
            self.compilation_config.pass_config.enable_sequence_parallelism = \
                True
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            self.compilation_config.custom_ops.append("+rms_norm")

        if current_platform.support_static_graph_mode():
            # if cudagraph_mode is not explicitly set by users, set default
            # value
            if self.compilation_config.cudagraph_mode is None:
                if envs.VLLM_USE_V1 and self.compilation_config.level \
                    == CompilationLevel.PIECEWISE:
                    if not envs.VLLM_USE_PIECEWISE:
                        # default to full and piecewise for most models
                        self.compilation_config.cudagraph_mode = \
                            CUDAGraphMode.FULL_AND_PIECEWISE

                        # pooling models and encoder-decoder models
                        # do not support full cudagraphs
                        if self.model_config is not None and \
                            (self.model_config.pooler_config is not None
                            or self.model_config.is_encoder_decoder):
                            self.compilation_config.cudagraph_mode = \
                                CUDAGraphMode.PIECEWISE
                    else:
                        self.compilation_config.cudagraph_mode = \
                            CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

            # disable cudagraph when enforce eager execution
            if self.model_config is not None and \
                    self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            elif envs.VLLM_USE_V1:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()
        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.kv_sharing_fast_prefill:

            if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens.")

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")

        # Collect every reason first so that each one is logged before
        # chunked prefill and prefix caching are switched off together.
        disable_chunked_prefill_reasons: list[str] = []

        if self.model_config:
            if self.model_config.pooler_config:
                pooling_type = self.model_config.pooler_config.pooling_type
                if pooling_type is None or pooling_type.lower() != "last":
                    disable_chunked_prefill_reasons.append(
                        "Only \"last\" pooling supports chunked "
                        "prefill and prefix caching; disabling both.")
                if not getattr(self.model_config.hf_config, "is_causal", True):
                    disable_chunked_prefill_reasons.append(
                        "Only models using causal attention supports chunked "
                        "prefill and prefix caching; disabling both.")
            elif self.model_config.is_encoder_decoder:
                self.scheduler_config.max_num_encoder_input_tokens = \
                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(
                        self.model_config)
                logger.debug(
                    "Encoder-decoder model detected: setting "
                    "`max_num_encoder_input_tokens` to encoder length (%s)",
                    self.scheduler_config.max_num_encoder_input_tokens)
                self.scheduler_config.disable_chunked_mm_input = True
                disable_chunked_prefill_reasons.append(
                    "Encoder-decoder models do not support chunked prefill nor"
                    " prefix caching; disabling both.")
                if (self.model_config.architecture
                        == "WhisperForConditionalGeneration"
                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
                        != "spawn"):
                    logger.warning(
                        "Whisper is known to have issues with "
                        "forked workers. If startup is hanging, "
                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                        "to 'spawn'.")

        if disable_chunked_prefill_reasons:
            for reason in disable_chunked_prefill_reasons:
                logger.info(reason)
            self.scheduler_config.chunked_prefill_enabled = False
            self.scheduler_config.long_prefill_token_threshold = 0

            if self.cache_config is not None:
                self.cache_config.enable_prefix_caching = False

        # NOTE: adjacent string literals are concatenated verbatim, so each
        # fragment below carries its own trailing space.
        if (self.kv_events_config is not None
                and self.kv_events_config.enable_kv_cache_events
                and not self.cache_config.enable_prefix_caching):
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled. "
                "Use --enable-prefix-caching to enable.")
        if (self.kv_events_config is not None
                and self.kv_events_config.publisher != "null"
                and not self.kv_events_config.enable_kv_cache_events):
            logger.warning("KV cache events are disabled, "
                           "but the scheduler is configured to publish them. "
                           "Modify KVEventsConfig.enable_kv_cache_events "
                           "to True to enable.")
        current_platform.check_and_update_config(self)

        # final check of cudagraph mode after platform-specific update
        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
            if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
                and self.model_config is not None and \
                not self.model_config.disable_cascade_attn:
                logger.info("CUDAGraphMode.FULL is not supported with "
                            "cascade attention currently. Disabling cascade "
                            "attention.")
                self.model_config.disable_cascade_attn = True

            if self.compilation_config.cudagraph_mode\
                .requires_piecewise_compilation():
                assert self.compilation_config.level == \
                    CompilationLevel.PIECEWISE, \
                    "Compilation level should be CompilationLevel.PIECEWISE "\
                    "when cudagraph_mode piecewise cudagraphs is used, "\
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

        if self.parallel_config.enable_dbo:
            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
            assert a2a_backend in \
                ["deepep_low_latency", "deepep_high_throughput"], \
            "Microbatching currently only supports the deepep_low_latency and "\
            f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
            "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
            "variable to deepep_low_latency or deepep_high_throughput and "\
            "install the DeepEP kernels."

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        # Do this after all the updates to compilation_config.level
        if envs.VLLM_USE_V1 and \
            self.compilation_config.level == CompilationLevel.PIECEWISE:
            self.compilation_config.set_splitting_ops_for_v1()

        if (envs.VLLM_USE_V1
                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
            # logger should only print warning message for hybrid models. As we
            # can't know whether the model is hybrid or not now, so we don't log
            # warning message here and will log it later.
            if not current_platform.support_hybrid_kv_cache():
                # Hybrid KV cache manager is not supported on non-GPU platforms.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_transfer_config is not None:
                # Hybrid KV cache manager is not compatible with KV transfer.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_events_config is not None:
                # Hybrid KV cache manager is not compatible with KV events.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.model_config is not None and \
                self.model_config.attention_chunk_size is not None:
                if self.speculative_config is not None and \
                    self.speculative_config.use_eagle():
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention + eagle.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
                elif \
                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                    logger.warning(
                        "There is a latency regression when using chunked local"
                        " attention with the hybrid KV cache manager. Disabling"
                        " it, by default. To enable it, set the environment "
                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                    )
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
545

546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
    def update_sizes_for_sequence_parallelism(self,
                                              possible_sizes: list) -> list:
        """Keep only the sizes that are a multiple of the TP size.

        Sizes that are not divisible by
        `parallel_config.tensor_parallel_size` are dropped, and a warning
        listing them is logged when there are any. The relative order of the
        remaining sizes is preserved.
        """
        tp_size = self.parallel_config.tensor_parallel_size

        kept_sizes = []
        removed_sizes = []
        for size in possible_sizes:
            if size % tp_size == 0:
                kept_sizes.append(size)
            else:
                removed_sizes.append(size)

        if removed_sizes:
            logger.warning(
                "Batch sizes %s are removed because they are not "
                "multiple of tp_size %d when "
                "sequence parallelism is enabled", removed_sizes, tp_size)

        return kept_sizes

566
567
    def _set_cudagraph_sizes(self):
        """
568
569
        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:
570

571
572
573
574
        ```python
        max_graph_size = min(max_num_seqs * 2, 512)
        # 1, 2, 4, then multiples of 8 up to max_graph_size
        cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
575

576
577
        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in descending order).
578

579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic.
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
            padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
            not be used.
602
        """
603
604

        # calculate the default `batch_size_capture_list`
605
606
607
        batch_size_capture_list = []
        if self.model_config is not None and \
            not self.model_config.enforce_eager:
608
609
610
611
            if self.model_config.use_mla and self.compilation_config.full_cuda_graph and self.scheduler_config.max_num_seqs <= 512:
                cuda_graph_sizes = [self.scheduler_config.max_num_seqs]
            else:
                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes 
612
613
614
            if len(cuda_graph_sizes) == 1:
                batch_size_capture_list = [1, 2, 4] + [
                    i for i in range(8, cuda_graph_sizes[0] + 1, 8)
615
                ]
616
617
618
619
620
621
622
623
624
625
626
627
628
            elif len(cuda_graph_sizes) > 1:
                batch_size_capture_list = sorted(cuda_graph_sizes)
            else:
                raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
            if self.parallel_config.tensor_parallel_size > 1 and \
                self.compilation_config.pass_config.enable_sequence_parallelism:
                batch_size_capture_list = \
                    self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
            max_num_tokens = self.scheduler_config.max_num_batched_tokens
            batch_size_capture_list = [
                size for size in batch_size_capture_list
                if size <= max_num_tokens
            ]
629
630
631
632
633
            
            # add for spec decode
            if self.speculative_config is not None and self.speculative_config.num_lookahead_slots > 0:
                batch_size_capture_list = list(map(lambda x: x * (1 + self.speculative_config.num_lookahead_slots),
                                                    batch_size_capture_list))
634
635
636
637

        self.compilation_config.init_with_cudagraph_sizes(
            batch_size_capture_list)

638
    def recalculate_max_model_len(self, max_model_len: int) -> None:
        """Verify ``max_model_len`` and store it on the dependent configs.

        The requested value is passed through
        ``model_config.get_and_verify_max_len`` and whatever that returns is
        written to both ``model_config.max_model_len`` and
        ``scheduler_config.max_model_len`` so the two stay in sync.

        Can only be called in ``try_verify_and_update_config``.

        Args:
            max_model_len: The requested maximum model length.
        """
        model_config = self.model_config
        verified_len = model_config.get_and_verify_max_len(max_model_len)
        self.model_config.max_model_len = verified_len
        self.scheduler_config.max_model_len = verified_len
644
645

    def try_verify_and_update_config(self):
        """Run the model-specific verify/update hooks once per model config.

        Returns immediately when there is no ``model_config`` or when the
        model config is already flagged ``config_updated``. Otherwise the
        flag is set first, and the method returns if the architecture is
        ``None``. After that it applies, in order:

        1. the hook registered for the architecture in ``MODELS_CONFIG_MAP``
           (if any),
        2. ``HybridAttentionMambaModelConfig`` when ``model_config.is_hybrid``,
        3. ``SequenceClassificationConfig`` when ``convert_type`` is
           ``"classify"``,
        4. the Run:ai check: if ``model_config.model_weights`` is a Run:ai
           object URI, a ``load_format`` of ``"auto"`` is overridden to
           ``"runai_streamer"``.

        Raises:
            ValueError: If the model weights are a Run:ai object URI and
                ``load_format`` is neither ``"auto"`` nor
                ``"runai_streamer"``.
        """
        if self.model_config is None:
            return

        # Avoid running try_verify_and_update_config multiple times
        if getattr(self.model_config, "config_updated", False):
            return
        self.model_config.config_updated = True

        architecture = self.model_config.architecture
        if architecture is None:
            return

        # Imported here rather than at module top level.
        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
        cls = MODELS_CONFIG_MAP.get(architecture, None)
        if cls is not None:
            cls.verify_and_update_config(self)

        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)
            SequenceClassificationConfig.verify_and_update_config(self)

        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
                self.model_config.model_weights):
            if self.load_config.load_format == "auto":
                logger.info("Detected Run:ai model config. "
                            "Overriding `load_format` to 'runai_streamer'")
                self.load_config.load_format = "runai_streamer"
            elif self.load_config.load_format != "runai_streamer":
                raise ValueError(f"To load a model from S3, 'load_format' "
                                 f"must be 'runai_streamer', "
                                 f"but got '{self.load_config.load_format}'. "
                                 f"Model: {self.model_config.model}")

685
    def __str__(self) -> str:
        """Return a one-line, comma-separated summary of the key settings.

        The summary is assembled from fields of the model, load, parallel,
        cache, device and scheduler configs, plus the speculative,
        structured-outputs, observability and compilation configs. Fields
        formatted with ``!r`` use their ``repr``.
        """
        return (
            f"model={self.model_config.model!r}, "
            f"speculative_config={self.speculative_config!r}, "
            f"tokenizer={self.model_config.tokenizer!r}, "
            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "
            f"max_seq_len={self.model_config.max_model_len}, "
            f"download_dir={self.load_config.download_dir!r}, "
            f"load_format={self.load_config.load_format}, "
            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
            f"device_config={self.device_config.device}, "
            f"structured_outputs_config={self.structured_outputs_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={self.model_config.seed}, "
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")
715
716
717


# Config installed by `set_current_vllm_config` while its context is active;
# that context manager restores the previous value on exit.
_current_vllm_config: Optional[VllmConfig] = None
# Prefix installed by `set_current_vllm_config` alongside the config and
# restored on exit in the same way.
_current_prefix: Optional[str] = None
719
720
721


@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig,
                            check_compile=False,
                            prefix: Optional[str] = None):
    """
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.

    Args:
        vllm_config: The config to expose as the current one inside the
            ``with`` block.
        check_compile: If true, after the block finishes without an
            exception, run ``compilation_config.custom_op_log_check()`` and
            warn when the compilation level is ``PIECEWISE`` but
            ``compilation_counter.num_models_seen`` did not change.
        prefix: The model prefix to expose as the current one inside the
            ``with`` block.

    The previous config and prefix are restored on exit, whether or not the
    block raised.
    """
    global _current_vllm_config, _current_prefix
    old_vllm_config = _current_vllm_config
    old_prefix = _current_prefix
    from vllm.compilation.counter import compilation_counter
    num_models_seen = compilation_counter.num_models_seen
    try:
        # Clear the compilation config cache on entry as well as on exit:
        # a value cached before this context was entered (for example from
        # an enclosing context) would otherwise be served, stale, inside it.
        get_cached_compilation_config.cache_clear()

        _current_vllm_config = vllm_config
        _current_prefix = prefix
        yield
    except Exception:
        # Plain re-raise: an `except` clause is syntactically required for
        # the `else` branch below, which must only run on success.
        raise
    else:
        if check_compile:
            vllm_config.compilation_config.custom_op_log_check()

        if check_compile and \
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
            and compilation_counter.num_models_seen == num_models_seen:
            # If the model supports compilation,
            # compilation_counter.num_models_seen should be increased
            # by at least 1.
            # If it is not increased, it means the model does not support
            # compilation (does not have @support_torch_compile decorator).
            logger.warning(
                "`torch.compile` is turned on, but the model %s"
                " does not support it. Please open an issue on GitHub"
                " if you want it to be supported.",
                vllm_config.model_config.model)
    finally:
        _current_vllm_config = old_vllm_config
        _current_prefix = old_prefix
        # Clear the compilation config cache when context changes
        get_cached_compilation_config.cache_clear()


@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Return the current vLLM config's ``compilation_config``, memoized.

    The result is held in a single-entry ``lru_cache`` so that repeated
    callers do not go through ``get_current_vllm_config()`` each time.
    """
    current_config = get_current_vllm_config()
    return current_config.compilation_config
771
772
773
774
775
776
777


def get_current_vllm_config() -> VllmConfig:
    """Return the current vLLM config, or a default one if none is set.

    When the module-level ``_current_vllm_config`` is ``None``, a warning is
    logged and a freshly constructed ``VllmConfig()`` is returned; the
    global itself is left untouched.
    """
    if _current_vllm_config is None:
        # in ci, usually when we test custom ops/modules directly,
        # we don't set the vllm config. In that case, we set a default
        # config.
        logger.warning("Current vLLM config is not set.")
        from vllm.config import VllmConfig
        return VllmConfig()
    return _current_vllm_config
782
783


784
785
786
787
788
789
790
791
792
def get_current_model_prefix() -> str:
    """
    Return the prefix stored in the module-level ``_current_prefix``.

    Asserts that a prefix has been set (i.e. the global is not ``None``).
    """
    prefix = _current_prefix
    assert prefix is not None, (
        "Current model prefix is not set. ")
    return prefix


793
794
795
# Generic type variable; `get_layers_from_vllm_config` uses it to tie the
# requested `layer_type` to the value type of the dict it returns.
T = TypeVar("T")


796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
def get_layers_from_vllm_config(
        vllm_config: VllmConfig,
        layer_type: type[T],
        layer_names: Optional[list[str]] = None) -> dict[str, T]:
    """
    Get layers from the vLLM config.

    Layers are looked up in
    ``vllm_config.compilation_config.static_forward_context`` and kept only
    if they are instances of ``layer_type``.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.

    Returns:
        A dict mapping each selected layer name to its layer, in the order
        the names were given (or the context's own order when
        ``layer_names`` is None).
    """
    # Fetch the context once; it serves both the default name list and the
    # per-name lookups below.
    forward_context = vllm_config.compilation_config.static_forward_context

    # Only `None` means "all layers"; an explicit empty list selects nothing.
    if layer_names is None:
        layer_names = list(forward_context.keys())

    return {
        layer_name: forward_context[layer_name]
        for layer_name in layer_names
        if isinstance(forward_context[layer_name], layer_type)
    }
820
821


822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
def update_config(config: DataclassInstanceT,
                  overrides: dict[str, Any]) -> DataclassInstanceT:
    """Return a copy of ``config`` with ``overrides`` applied.

    Each key of ``overrides`` must name an existing attribute of ``config``.
    When the current value of that attribute is a dataclass and the override
    is not, the override must be a dict and is applied recursively to the
    nested dataclass. The new instance is built with ``dataclasses.replace``;
    ``config`` itself is not modified.
    """

    def _resolve(field_name: str, value: Any) -> Any:
        # Check one override and expand a dict aimed at a nested dataclass.
        assert hasattr(
            config, field_name), f"{type(config)} has no field `{field_name}`"
        current_value = getattr(config, field_name)
        if not is_dataclass(current_value) or is_dataclass(value):
            return value
        assert isinstance(value, dict), (
            f"Overrides to {type(config)}.{field_name} must be a dict"
            f"  or {type(current_value)}, but got {type(value)}")
        return update_config(
            current_value,  # type: ignore[type-var]
            value)

    return replace(
        config, **{
            field_name: _resolve(field_name, value)
            for field_name, value in overrides.items()
        })