__init__.py 45.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
# ruff: noqa: F401
5
import ast
6
import copy
7
import hashlib
8
import inspect
9
import json
10
import os
11
import textwrap
12
from contextlib import contextmanager
13
from dataclasses import field, fields, is_dataclass, replace
14
from functools import cached_property, lru_cache
15
16
from typing import (TYPE_CHECKING, Any, Literal, Optional, Protocol, TypeVar,
                    Union, cast)
17

18
import regex as re
19
import torch
20
from pydantic import ConfigDict, SkipValidation
21
from pydantic.dataclasses import dataclass
22
from typing_extensions import runtime_checkable
23

24
import vllm.envs as envs
25
from vllm import version
26
from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
27
                               PrefixCachingHashAlgo)
28
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
29
                                     CUDAGraphMode, PassConfig)
30
from vllm.config.kv_events import KVEventsConfig
31
from vllm.config.kv_transfer import KVTransferConfig
32
from vllm.config.load import LoadConfig
33
from vllm.config.lora import LoRAConfig
34
35
36
37
38
from vllm.config.model import (ConvertOption, HfOverrides, LogprobsMode,
                               ModelConfig, ModelDType, ModelImpl,
                               RunnerOption, TaskOption, TokenizerMode,
                               iter_architecture_defaults,
                               try_match_architecture_defaults)
39
40
from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
                                    MultiModalConfig)
41
42
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                  ParallelConfig)
43
from vllm.config.pooler import PoolerConfig
44
from vllm.config.scheduler import RunnerType, SchedulerConfig, SchedulerPolicy
45
from vllm.config.speculative import SpeculativeConfig
46
from vllm.config.structured_outputs import StructuredOutputsConfig
47
from vllm.config.utils import ConfigType, config, get_attr_docs, is_init_field
Woosuk Kwon's avatar
Woosuk Kwon committed
48
from vllm.logger import init_logger
49
from vllm.multimodal import MULTIMODAL_REGISTRY
50
51
from vllm.transformers_utils.runai_utils import is_runai_obj_uri
from vllm.utils import random_uuid
52

53
if TYPE_CHECKING:
54
    from _typeshed import DataclassInstance
55
    from transformers.configuration_utils import PretrainedConfig
56

57
58
59
    from vllm.model_executor.layers.quantization.base_config import (
        QuantizationConfig)
else:
60
    DataclassInstance = Any
61
    PretrainedConfig = Any
62
    QuantizationConfig = Any
63
    QuantizationMethods = Any
64
    BaseModelLoader = Any
65
    LogitsProcessor = Any
66

67
# Module-level logger shared by the config classes defined in this file.
logger = init_logger(__name__)
# Type variable bounded by `DataclassInstance` (aliased to `Any` at runtime,
# see the `TYPE_CHECKING` fallback above).
DataclassInstanceT = TypeVar("DataclassInstanceT", bound=DataclassInstance)
69

70

71
@runtime_checkable
class SupportsHash(Protocol):
    """Structural type for objects that expose a `compute_hash()` method.

    The protocol is `runtime_checkable`, so `isinstance(obj, SupportsHash)`
    succeeds for any object that has a `compute_hash` attribute.
    """

    def compute_hash(self) -> str:
        """Return a string hash identifying this object."""
        ...


78
79
class SupportsMetricsInfo(Protocol):
    """Structural type for objects that expose a `metrics_info()` method."""

    def metrics_info(self) -> dict[str, str]:
        """Return a string-to-string mapping describing this object."""
        ...


84
# Device names accepted by `DeviceConfig.device` (besides a `torch.device`).
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
85
86
87


@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class DeviceConfig:
    """Configuration for the device to use for vLLM execution."""

    device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
    """Device type for vLLM execution.
    This parameter is deprecated and will be
    removed in a future release.
    It will now be set automatically based
    on the current platform."""
    device_type: str = field(init=False)
    """Device type from the current platform. This is set in
    `__post_init__`."""

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # the device/platform information will be summarized
        # by torch/vllm automatically.
        factors: list[Any] = []
        hash_str = hashlib.md5(str(factors).encode(),
                               usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self):
        """Resolve `device_type` and normalize `device`.

        Raises:
            RuntimeError: If `device` is "auto" and the current platform
                reports an empty device type.
            TypeError: If `device` is neither a string nor a `torch.device`.
        """
        if self.device == "auto":
            # Automated device type detection
            from vllm.platforms import current_platform
            self.device_type = current_platform.device_type
            if not self.device_type:
                raise RuntimeError(
                    "Failed to infer device type, please set "
                    "the environment variable `VLLM_LOGGING_LEVEL=DEBUG` "
                    "to turn on verbose logging to help debug the issue.")
        else:
            # Device type is assigned explicitly
            if isinstance(self.device, str):
                self.device_type = self.device
            elif isinstance(self.device, torch.device):
                self.device_type = self.device.type
            else:
                # Anything else (e.g. `None`) would leave `device_type`
                # unset and fail below with an opaque AttributeError, so
                # report the actual problem instead.
                raise TypeError(
                    "`device` must be a device type string or a "
                    f"`torch.device`, got {type(self.device).__name__}.")

        # Some device types require processing inputs on CPU
        if self.device_type in ["tpu"]:
            self.device = None
        else:
            # Set device with device type
            self.device = torch.device(self.device_type)

146

147
148
149
150
# Module names accepted by `ObservabilityConfig.collect_detailed_traces`.
DetailedTraceModules = Literal["model", "worker", "all"]


@config
@dataclass
class ObservabilityConfig:
    """Configuration for observability - metrics and tracing."""

    show_hidden_metrics_for_version: Optional[str] = None
    """Enable deprecated Prometheus metrics that have been hidden since the
    specified version. For example, if a previously deprecated metric has been
    hidden since the v0.7.0 release, you use
    `--show-hidden-metrics-for-version=0.7` as a temporary escape hatch while
    you migrate to new metrics. The metric is likely to be removed completely
    in an upcoming release."""

    @cached_property
    def show_hidden_metrics(self) -> bool:
        """Check if the hidden metrics should be shown."""
        if self.show_hidden_metrics_for_version is None:
            return False
        return version._prev_minor_version_was(
            self.show_hidden_metrics_for_version)

    otlp_traces_endpoint: Optional[str] = None
    """Target URL to which OpenTelemetry traces will be sent."""

    collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
    """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
    set, it will collect detailed traces for the specified modules. This
    involves use of possibly costly and or blocking operations and hence might
    have a performance impact.

    Note that collecting detailed timing information for each request can be
    expensive."""

    @cached_property
    def collect_model_forward_time(self) -> bool:
        """Whether to collect model forward time for the request."""
        return (self.collect_detailed_traces is not None
                and ("model" in self.collect_detailed_traces
                     or "all" in self.collect_detailed_traces))

    @cached_property
    def collect_model_execute_time(self) -> bool:
        """Whether to collect model execute time for the request."""
        return (self.collect_detailed_traces is not None
                and ("worker" in self.collect_detailed_traces
                     or "all" in self.collect_detailed_traces))

    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.
        """
        # no factors to consider.
        # this config will not affect the computation graph.
        factors: list[Any] = []
        hash_str = hashlib.md5(str(factors).encode(),
                               usedforsecurity=False).hexdigest()
        return hash_str

    def __post_init__(self):
        """Normalize `collect_detailed_traces` and validate the OTLP setup.

        Raises:
            ValueError: If `otlp_traces_endpoint` is set but OpenTelemetry
                is not available.
        """
        # A single comma-separated entry such as ["model,worker"] is
        # expanded into one entry per module.
        if (self.collect_detailed_traces is not None
                and len(self.collect_detailed_traces) == 1
                and "," in self.collect_detailed_traces[0]):
            self._parse_collect_detailed_traces()

        from vllm.tracing import is_otel_available, otel_import_error_traceback
        if not is_otel_available() and self.otlp_traces_endpoint is not None:
            raise ValueError(
                "OpenTelemetry is not available. Unable to configure "
                "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
                f"installed. Original error:\n{otel_import_error_traceback}")

    def _parse_collect_detailed_traces(self):
        """Split the first `collect_detailed_traces` entry on commas."""
        assert isinstance(self.collect_detailed_traces, list)
        self.collect_detailed_traces = cast(
            list[DetailedTraceModules],
            self.collect_detailed_traces[0].split(","))

235

236
@config
237
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
238
239
class VllmConfig:
    """Dataclass which contains all vllm-related configuration. This
240
241
242
    simplifies passing around the distinct configurations in the codebase.
    """

243
244
245
    # TODO: use default_factory once default constructing ModelConfig doesn't
    # try to download a model
    model_config: ModelConfig = None  # type: ignore
246
247
248
249
250
251
252
253
254
255
256
    """Model configuration."""
    cache_config: CacheConfig = field(default_factory=CacheConfig)
    """Cache configuration."""
    parallel_config: ParallelConfig = field(default_factory=ParallelConfig)
    """Parallel configuration."""
    scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig)
    """Scheduler configuration."""
    device_config: DeviceConfig = field(default_factory=DeviceConfig)
    """Device configuration."""
    load_config: LoadConfig = field(default_factory=LoadConfig)
    """Load configuration."""
257
    lora_config: Optional[LoRAConfig] = None
258
259
260
    """LoRA configuration."""
    speculative_config: Optional[SpeculativeConfig] = None
    """Speculative decoding configuration."""
261
262
263
    structured_outputs_config: StructuredOutputsConfig = field(
        default_factory=StructuredOutputsConfig)
    """Structured outputs configuration."""
264
    observability_config: Optional[ObservabilityConfig] = None
265
    """Observability configuration."""
266
    quant_config: Optional[QuantizationConfig] = None
267
268
269
    """Quantization configuration."""
    compilation_config: CompilationConfig = field(
        default_factory=CompilationConfig)
270
    """`torch.compile` and cudagraph capture configuration for the model.
271

272
273
    As a shorthand, `-O<n>` can be used to directly specify the compilation
    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
274
    Currently, -O <n> and -O=<n> are supported as well but this will likely be
275
    removed in favor of clearer -O<n> syntax in the future.
276
277
278

    NOTE: level 0 is the default level without any optimization. level 1 and 2
    are for internal testing only. level 3 is the recommended level for
279
    production, also default in V1.
280
281
282
283
284
285

    You can specify the full compilation config like so:
    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
    """
    kv_transfer_config: Optional[KVTransferConfig] = None
    """The configurations for distributed KV cache transfer."""
286
    kv_events_config: Optional[KVEventsConfig] = None
287
    """The configurations for event publishing."""
288
    # some opaque config, only used to provide additional information
289
290
    # for the hash computation, mainly used for testing, debugging or out of
    # tree config registration.
291
292
293
294
    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
    """Additional config for specified platform. Different platforms may
    support different configs. Make sure the configs are valid for the platform
    you are using. Contents must be hashable."""
295
    instance_id: str = ""
296
    """The ID of the vLLM instance."""
297

298
299
300
301
302
303
304
305
306
307
308
309
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
        ensure that it is included in the factors list if
        it affects the computation graph.

        Provide a hash that uniquely identifies all the configs
        that affect the structure of the computation
        graph from input ids/embeddings to the final hidden states,
        excluding anything before input ids/embeddings and after
        the final hidden states.

        Returns:
            The first 10 hex characters of an MD5 digest over the vLLM
            version, `envs.VLLM_USE_V1` and the per-sub-config hashes.
        """
        from vllm import __version__

        def _hash_or_none(sub_config: Any) -> str:
            # A falsy (e.g. unset) sub-config contributes the literal "None".
            return sub_config.compute_hash() if sub_config else "None"

        # summarize vllm config; the order of the factors is part of the hash
        vllm_factors: list[Any] = [__version__, envs.VLLM_USE_V1]
        vllm_factors.extend(
            _hash_or_none(sub_config) for sub_config in (
                self.model_config,
                self.cache_config,
                self.parallel_config,
                self.scheduler_config,
                self.device_config,
                self.load_config,
            ))

        if self.lora_config:
            vllm_factors.append(self.lora_config.compute_hash())
            # LoRA creates static buffers based on max_num_batched_tokens.
            # The tensor sizes and strides get captured in the torch.compile
            # graph explicitly.
            vllm_factors.append(
                str(self.scheduler_config.max_num_batched_tokens))
        else:
            vllm_factors.append("None")

        vllm_factors.extend(
            _hash_or_none(sub_config) for sub_config in (
                self.speculative_config,
                self.structured_outputs_config,
                self.observability_config,
                # quant_config is intentionally absent: it should be
                # captured by model_config.quantization.
                self.compilation_config,
                self.kv_transfer_config,
            ))

        if self.additional_config:
            if isinstance(additional_config := self.additional_config, dict):
                additional_config_hash = hashlib.md5(
                    json.dumps(additional_config, sort_keys=True).encode(),
                    usedforsecurity=False,
                ).hexdigest()
            else:
                additional_config_hash = additional_config.compute_hash()
            vllm_factors.append(additional_config_hash)
        else:
            vllm_factors.append("None")

        factors: list[Any] = [vllm_factors]
        hash_str = hashlib.md5(str(factors).encode(),
                               usedforsecurity=False).hexdigest()[:10]
        return hash_str

389
390
391
392
393
394
    def pad_for_cudagraph(self, batch_size: int) -> int:
        """Look up the padded cudagraph size for `batch_size`.

        The caller must make sure that
        `batch_size <= self.compilation_config.max_capture_size`; a larger
        value should raise an IndexError from the lookup.
        """
        padded_sizes = self.compilation_config.bs_to_padded_graph_size
        return padded_sizes[batch_size]
395

396
397
398
399
400
    @staticmethod
    def _get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config.

        Returns:
            `None` when `model_config.quantization` is `None`, otherwise the
            config produced by `get_quant_config`.

        Raises:
            ValueError: If the current device capability is below the
                method's minimum, or `model_config.dtype` is not among the
                method's supported activation dtypes.
        """
        from vllm.platforms import current_platform
        if model_config.quantization is None:
            return None

        from vllm.model_executor.model_loader.weight_utils import (
            get_quant_config)
        quant_config = get_quant_config(model_config, load_config)
        capability_tuple = current_platform.get_device_capability()

        # The capability check is skipped when the platform reports none.
        if capability_tuple is not None:
            capability = capability_tuple.to_int()
            min_capability = quant_config.get_min_capability()
            if capability < min_capability:
                raise ValueError(
                    f"The quantization method {model_config.quantization} "
                    "is not supported for the current GPU. Minimum "
                    f"capability: {min_capability}. "
                    f"Current capability: {capability}.")

        supported_dtypes = quant_config.get_supported_act_dtypes()
        if model_config.dtype not in supported_dtypes:
            raise ValueError(
                f"{model_config.dtype} is not supported for quantization "
                f"method {model_config.quantization}. Supported dtypes: "
                f"{supported_dtypes}")
        return quant_config
424

425
426
427
428
429
430
431
432
433
434
435
    @staticmethod
    def get_quantization_config(
            model_config: ModelConfig,
            load_config: LoadConfig) -> Optional[QuantizationConfig]:
        """Get the quantization config without touching `model_config`.

        Delegates to `_get_quantization_config` on a deep copy of
        `model_config`.
        """
        import copy

        # For some reason, the _ version of this modifies the model_config
        # object, so it is handed a private deep copy instead.
        model_config_copy = copy.deepcopy(model_config)
        return VllmConfig._get_quantization_config(model_config_copy,
                                                   load_config)

436
437
438
439
440
441
442
443
444
    def with_hf_config(
        self,
        hf_config: PretrainedConfig,
        architectures: Optional[list[str]] = None,
    ) -> "VllmConfig":
        """Return a copy of this config whose model config uses `hf_config`.

        Args:
            hf_config: The HF config to install on the copied model config.
            architectures: If given, `hf_config` is deep-copied and the
                copy's `architectures` is overridden with this list, so the
                caller's object is not modified.

        Returns:
            A new config built with `dataclasses.replace`; `self` and its
            `model_config` are left untouched.
        """
        if architectures is not None:
            hf_config = copy.deepcopy(hf_config)
            hf_config.architectures = architectures

        model_config = copy.deepcopy(self.model_config)
        model_config.hf_config = hf_config

        return replace(self, model_config=model_config)

450
451
452
    def __post_init__(self):
        """Verify configs are valid & consistent with each other.

        Besides cross-checking the sub-configs, this fills in defaults that
        depend on several of them (quantization config, compilation level,
        cudagraph mode), disables chunked prefill / prefix caching and the
        hybrid KV cache manager where the checks below require it, and lets
        the current platform adjust the config. The steps are
        order-dependent.
        """

        self.try_verify_and_update_config()

        if self.model_config is not None:
            self.model_config.verify_with_parallel_config(self.parallel_config)
            self.model_config.verify_dual_chunk_attention_config(
                self.load_config)

        self.cache_config.verify_with_parallel_config(self.parallel_config)

        if self.lora_config is not None:
            self.lora_config.verify_with_cache_config(self.cache_config)
            self.lora_config.verify_with_model_config(self.model_config)

        if self.quant_config is None and self.model_config is not None:
            self.quant_config = VllmConfig._get_quantization_config(
                self.model_config, self.load_config)

        from vllm.platforms import current_platform
        if self.model_config is not None and \
            self.scheduler_config.chunked_prefill_enabled and \
            self.model_config.dtype == torch.float32 and \
            current_platform.get_device_capability() == (7, 5):
            logger.warning_once(
                "Turing devices tensor cores do not support float32 matmul. "
                "To workaround this limitation, vLLM will set 'ieee' input "
                "precision for chunked prefill triton kernels.")

        # If the user does not explicitly set a compilation level, then
        # we use the default level. The default level depends on other
        # settings (see the below code).
        if self.compilation_config.level is None:
            if envs.VLLM_USE_V1:
                if (self.model_config is not None
                        and not self.model_config.enforce_eager):
                    self.compilation_config.level = CompilationLevel.PIECEWISE
                else:
                    self.compilation_config.level = \
                            CompilationLevel.NO_COMPILATION

            else:
                # NB: Passing both --enforce-eager and a compilation level
                # in V0 means the compilation level wins out.
                self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # async tp is built on top of sequence parallelism
        # and requires it to be enabled.
        if self.compilation_config.pass_config.enable_async_tp:
            self.compilation_config.pass_config.enable_sequence_parallelism = \
                True
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            self.compilation_config.custom_ops.append("+rms_norm")

        if current_platform.is_cuda_alike() or current_platform.is_xpu():
            # if cudagraph_mode is not explicitly set by users, set default
            # value
            if self.compilation_config.cudagraph_mode is None:
                if envs.VLLM_USE_V1 and self.compilation_config.level \
                    == CompilationLevel.PIECEWISE:
                    self.compilation_config.cudagraph_mode = \
                        CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

            # disable cudagraph when enforce eager execution
            if self.model_config is not None and \
                    self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")
                self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            elif envs.VLLM_USE_V1:
                self.compilation_config.cudagraph_num_of_warmups = 1

            self._set_cudagraph_sizes()
        else:
            self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        if self.cache_config.cpu_offload_gb > 0 and \
            self.compilation_config.level != CompilationLevel.NO_COMPILATION \
                and not envs.VLLM_USE_V1:
            logger.warning(
                "CPU offload is not supported with `torch.compile` in v0 yet."
                " Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        if self.cache_config.kv_sharing_fast_prefill:
            if not envs.VLLM_USE_V1:
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not supported "
                    "in V0 currently.")

            if self.speculative_config is not None and \
                self.speculative_config.use_eagle():
                raise NotImplementedError(
                    "Fast prefill optimization for KV sharing is not "
                    "compatible with EAGLE as EAGLE requires correct logits "
                    "for all tokens while fast prefill gives incorrect logits "
                    "for prompt tokens.")

            logger.warning_once(
                "--kv-sharing-fast-prefill requires changes on model side for "
                "correctness and to realize prefill savings. ")

        if ((not envs.VLLM_USE_V1) and self.lora_config is not None
                and self.compilation_config.level
                != CompilationLevel.NO_COMPILATION):
            logger.warning(
                "LoRA for V0 is not supported with `torch.compile` yet. "
                "Disabling `torch.compile`.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION

        # Collect every reason why chunked prefill (and prefix caching)
        # must be turned off; they are logged and applied together below.
        disable_chunked_prefill_reasons: list[str] = []

        if self.model_config:
            if self.model_config.pooler_config:
                pooling_type = self.model_config.pooler_config.pooling_type
                if pooling_type is None or pooling_type.lower() != "last":
                    disable_chunked_prefill_reasons.append(
                        "Only \"last\" pooling supports chunked "
                        "prefill and prefix caching; disabling both.")
                if not getattr(self.model_config.hf_config, "is_causal", True):
                    disable_chunked_prefill_reasons.append(
                        "Only models using causal attention supports chunked "
                        "prefill and prefix caching; disabling both.")
            elif self.model_config.is_encoder_decoder:
                self.scheduler_config.max_num_encoder_input_tokens = \
                    MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
                logger.debug(
                    "Encoder-decoder model detected: setting "
                    "`max_num_encoder_input_tokens` to encoder length (%s)",
                    self.scheduler_config.max_num_encoder_input_tokens)
                self.scheduler_config.disable_chunked_mm_input = True
                disable_chunked_prefill_reasons.append(
                    "Encoder-decoder models do not support chunked prefill nor"
                    " prefix caching; disabling both.")
                if (self.model_config.architecture
                        == "WhisperForConditionalGeneration"
                        and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
                        != "spawn"):
                    logger.warning(
                        "Whisper is known to have issues with "
                        "forked workers. If startup is hanging, "
                        "try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
                        "to 'spawn'.")

        if disable_chunked_prefill_reasons:
            for reason in disable_chunked_prefill_reasons:
                logger.info(reason)
            self.scheduler_config.chunked_prefill_enabled = False
            self.scheduler_config.long_prefill_token_threshold = 0

            if self.cache_config is not None:
                self.cache_config.enable_prefix_caching = False

        if (self.kv_events_config is not None
                and self.kv_events_config.enable_kv_cache_events
                and not self.cache_config.enable_prefix_caching):
            # Message fragments are joined by implicit concatenation, so
            # each fragment carries its own separating space.
            logger.warning(
                "KV cache events are on, but prefix caching is not enabled. "
                "Use --enable-prefix-caching to enable.")
        if (self.kv_events_config is not None
                and self.kv_events_config.publisher != "null"
                and not self.kv_events_config.enable_kv_cache_events):
            logger.warning("KV cache events are disabled, "
                           "but the scheduler is configured to publish them. "
                           "Modify KVEventsConfig.enable_kv_cache_events "
                           "to True to enable.")
        current_platform.check_and_update_config(self)

        # final check of cudagraph mode after platform-specific update
        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
            if self.compilation_config.cudagraph_mode == CUDAGraphMode.FULL \
                and self.model_config is not None and \
                not self.model_config.disable_cascade_attn:
                logger.info("CUDAGraphMode.FULL is not supported with "
                            "cascade attention currently. Disabling cascade "
                            "attention.")
                self.model_config.disable_cascade_attn = True

            if self.compilation_config.cudagraph_mode\
                .requires_piecewise_compilation():
                assert self.compilation_config.level == \
                    CompilationLevel.PIECEWISE, \
                    "Compilation level should be CompilationLevel.PIECEWISE "\
                    "when cudagraph_mode piecewise cudagraphs is used, "\
                    f"cudagraph_mode={self.compilation_config.cudagraph_mode}"

        if self.parallel_config.enable_dbo:
            a2a_backend = envs.VLLM_ALL2ALL_BACKEND
            assert a2a_backend == "deepep_low_latency", \
            "Microbatching currently only supports the deepep_low_latency "\
            f"all2all backend. {a2a_backend} is not supported. To fix set "\
            "the VLLM_ALL2ALL_BACKEND environment variable to "\
            "deepep_low_latency and install the DeepEP kernels."

        if not self.instance_id:
            self.instance_id = random_uuid()[:5]

        # Do this after all the updates to compilation_config.level
        if envs.VLLM_USE_V1 and \
            self.compilation_config.level == CompilationLevel.PIECEWISE:
            self.compilation_config.set_splitting_ops_for_v1()

        if (envs.VLLM_USE_V1
                and not self.scheduler_config.disable_hybrid_kv_cache_manager):
            # logger should only print warning message for hybrid models. As we
            # can't know whether the model is hybrid or not now, so we don't log
            # warning message here and will log it later.
            if not current_platform.support_hybrid_kv_cache():
                # Hybrid KV cache manager is not supported on non-GPU platforms.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_transfer_config is not None:
                # Hybrid KV cache manager is not compatible with KV transfer.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.kv_events_config is not None:
                # Hybrid KV cache manager is not compatible with KV events.
                self.scheduler_config.disable_hybrid_kv_cache_manager = True
            if self.model_config is not None and \
                self.model_config.attention_chunk_size is not None:
                if self.speculative_config is not None and \
                    self.speculative_config.use_eagle():
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention + eagle.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
                elif \
                    not envs.VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE:
                    logger.warning(
                        "There is a latency regression when using chunked local"
                        " attention with the hybrid KV cache manager. Disabling"
                        " it, by default. To enable it, set the environment "
                        "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE=1."
                    )
                    # Hybrid KV cache manager is not yet supported with chunked
                    # local attention.
                    self.scheduler_config.disable_hybrid_kv_cache_manager = True
687

688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
    def update_sizes_for_sequence_parallelism(self,
                                              possible_sizes: list) -> list:
        """Keep only the sizes that are a multiple of the tensor-parallel size.

        Args:
            possible_sizes: Candidate batch sizes.

        Returns:
            The entries of `possible_sizes` that are divisible by
            `parallel_config.tensor_parallel_size`, in their original order.
            A single warning listing the dropped sizes is logged when at
            least one size is dropped.
        """
        tp_size = self.parallel_config.tensor_parallel_size

        # Split the candidates in one pass instead of filtering twice.
        kept: list = []
        dropped: list = []
        for candidate in possible_sizes:
            if candidate % tp_size == 0:
                kept.append(candidate)
            else:
                dropped.append(candidate)

        if dropped:
            logger.warning(
                "Batch sizes %s are removed because they are not "
                "multiple of tp_size %d when "
                "sequence parallelism is enabled", dropped, tp_size)

        return kept

708
709
    def _set_cudagraph_sizes(self):
        """
        vLLM defines the default candidate list of batch sizes for CUDA graph
        capture as:

        ```python
        max_graph_size = min(max_num_seqs * 2, 512)
        # 1, 2, 4, then multiples of 8 up to max_graph_size
        cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
        ```

        In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
        will be the final sizes to capture cudagraph (in descending order).

        These sizes are used to capture and reuse CUDA graphs for
        performance-critical paths (e.g., decoding). Capturing enables
        significantly faster kernel dispatch by avoiding Python overhead. The
        list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
        most GPUs), which controls the total allowed number of tokens in a
        batch. Since each sequence may have a variable number of tokens, the
        maximum usable batch size will depend on actual sequence lengths.

        Example:
            With `max_num_batched_tokens = 8192`, and typical sequences
            averaging ~32 tokens, most practical batch sizes fall below 256.
            However, the system will still allow capture sizes up to 512 if
            shape and memory permit.

        Note:
            If users explicitly specify cudagraph capture sizes in the
            compilation config, those will override this default logic.
            At runtime:

            - If batch size <= one of the `cudagraph_capture_sizes`, the closest
            padded CUDA graph will be used.
            - If batch size > largest `cudagraph_capture_sizes`, cudagraph will
            not be used.

        Raises:
            TypeError: On the V1 path, if `scheduler_config.cuda_graph_sizes`
                is empty.
        """

        # calculate the default `batch_size_capture_list`; it stays empty when
        # there is no model config or eager execution is enforced.
        batch_size_capture_list = []
        if not envs.VLLM_USE_V1:
            if self.scheduler_config is not None and \
                self.model_config is not None and \
                    not self.model_config.enforce_eager:

                possible_sizes = [1, 2, 4] + [8 * i for i in range(1, 1025)]
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    possible_sizes = self.update_sizes_for_sequence_parallelism(
                        possible_sizes)

                # find the minimum size that is not smaller than max_num_seqs,
                # which then becomes the max_batchsize_to_capture
                larger_sizes = [
                    x for x in possible_sizes
                    if x >= self.scheduler_config.max_num_seqs
                ]
                if larger_sizes:
                    max_batchsize_to_capture = larger_sizes[0]
                else:
                    max_batchsize_to_capture = possible_sizes[-1]

                # filter out the sizes that are
                # larger than max_batchsize_to_capture
                batch_size_capture_list = [
                    size for size in possible_sizes
                    if size <= max_batchsize_to_capture
                ]
        else:
            if self.model_config is not None and \
                not self.model_config.enforce_eager:
                cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
                if len(cuda_graph_sizes) == 1:
                    # A single value is treated as the maximum size:
                    # 1, 2, 4, then multiples of 8 up to that value.
                    batch_size_capture_list = [1, 2, 4] + list(
                        range(8, cuda_graph_sizes[0] + 1, 8))
                elif len(cuda_graph_sizes) > 1:
                    # Several values are taken as the explicit list.
                    batch_size_capture_list = sorted(cuda_graph_sizes)
                else:
                    raise TypeError(f"Invalid value for {cuda_graph_sizes=}.")
                if self.parallel_config.tensor_parallel_size > 1 and \
                    self.compilation_config.pass_config.enable_sequence_parallelism:
                    batch_size_capture_list = \
                        self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
                # Drop sizes above the scheduler's batched-token limit.
                max_num_tokens = self.scheduler_config.max_num_batched_tokens
                batch_size_capture_list = [
                    size for size in batch_size_capture_list
                    if size <= max_num_tokens
                ]

        self.compilation_config.init_with_cudagraph_sizes(
            batch_size_capture_list)

802
    def recalculate_max_model_len(self, max_model_len: int):
        """Re-validate `max_model_len` and store it on both sub-configs.

        The requested value is passed through
        `model_config.get_and_verify_max_len`, and the value that call
        returns is written to both `model_config.max_model_len` and
        `scheduler_config.max_model_len`.

        Args:
            max_model_len: The requested maximum model length.
        """
        # Can only be called in try_verify_and_update_config
        model_config = self.model_config
        max_model_len = model_config.get_and_verify_max_len(max_model_len)
        self.model_config.max_model_len = max_model_len
        self.scheduler_config.max_model_len = max_model_len
808
809

    def try_verify_and_update_config(self):
        """Run the model-specific config verification hooks, at most once.

        Does nothing when there is no model config, when the model config is
        already marked with `config_updated`, or when the model architecture
        is None. Otherwise it calls, in order and each only when applicable:
        the `MODELS_CONFIG_MAP` entry for the architecture, the hybrid
        attention/mamba config for hybrid models, and the sequence
        classification config when `convert_type` is "classify". Finally it
        reconciles `load_config.load_format` for Run:ai object URIs.

        Raises:
            ValueError: If the model weights are a Run:ai object URI and
                `load_format` is neither "auto" nor "runai_streamer".
        """
        if self.model_config is None:
            return

        # Avoid running try_verify_and_update_config multiple times
        if getattr(self.model_config, "config_updated", False):
            return
        # The flag is set before the architecture check, so a config without
        # an architecture is still marked as processed.
        self.model_config.config_updated = True

        architecture = self.model_config.architecture
        if architecture is None:
            return

        from vllm.model_executor.models.config import (
            MODELS_CONFIG_MAP, HybridAttentionMambaModelConfig)
        cls = MODELS_CONFIG_MAP.get(architecture, None)
        if cls is not None:
            cls.verify_and_update_config(self)

        if self.model_config.is_hybrid:
            HybridAttentionMambaModelConfig.verify_and_update_config(self)

        if self.model_config.convert_type == "classify":
            # Maybe convert ForCausalLM into ForSequenceClassification model.
            from vllm.model_executor.models.adapters import (
                SequenceClassificationConfig)
            SequenceClassificationConfig.verify_and_update_config(self)

        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
                self.model_config.model_weights):
            if self.load_config.load_format == "auto":
                logger.info("Detected Run:ai model config. "
                            "Overriding `load_format` to 'runai_streamer'")
                self.load_config.load_format = "runai_streamer"
            elif self.load_config.load_format != "runai_streamer":
                raise ValueError(f"To load a model from S3, 'load_format' "
                                 f"must be 'runai_streamer', "
                                 f"but got '{self.load_config.load_format}'. "
                                 f"Model: {self.model_config.model}")

849
    def __str__(self):
        """Return a one-line, comma-separated summary of selected settings.

        The summary is assembled from fields of the model, load, parallel,
        cache, device, scheduler and compilation sub-configs, plus the
        speculative, structured-outputs and observability configs.
        """
        return (
            f"model={self.model_config.model!r}, "
            f"speculative_config={self.speculative_config!r}, "
            f"tokenizer={self.model_config.tokenizer!r}, "
            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "
            f"max_seq_len={self.model_config.max_model_len}, "
            f"download_dir={self.load_config.download_dir!r}, "
            f"load_format={self.load_config.load_format}, "
            f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, "  # noqa
            f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, "  # noqa
            f"data_parallel_size={self.parallel_config.data_parallel_size}, "  # noqa
            f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, "  # noqa
            f"quantization={self.model_config.quantization}, "
            f"enforce_eager={self.model_config.enforce_eager}, "
            f"kv_cache_dtype={self.cache_config.cache_dtype}, "
            f"device_config={self.device_config.device}, "
            f"structured_outputs_config={self.structured_outputs_config!r}, "
            f"observability_config={self.observability_config!r}, "
            f"seed={self.model_config.seed}, "
            f"served_model_name={self.model_config.served_model_name}, "
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")
879
880
881


# Config made current by `set_current_vllm_config` for the duration of its
# `with` block; None while no config is set.
_current_vllm_config: Optional[VllmConfig] = None
# Model prefix made current by `set_current_vllm_config`; None while unset.
_current_prefix: Optional[str] = None
883
884
885


@contextmanager
def set_current_vllm_config(vllm_config: VllmConfig,
                            check_compile=False,
                            prefix: Optional[str] = None):
    """
    Temporarily set the current vLLM config.
    Used during model initialization.
    We save the current vLLM config in a global variable,
    so that all modules can access it, e.g. custom ops
    can access the vLLM config to determine how to dispatch.

    Args:
        vllm_config: The config to make current inside the `with` block.
        check_compile: If True, warn on successful exit when the compilation
            level is PIECEWISE but `compilation_counter.num_models_seen` did
            not change inside the block.
        prefix: The model prefix to make current inside the `with` block.

    On exit (normal or exceptional) the previous config and prefix are
    restored and the `get_cached_compilation_config` cache is cleared.
    """
    global _current_vllm_config, _current_prefix
    old_vllm_config = _current_vllm_config
    old_prefix = _current_prefix
    from vllm.compilation.counter import compilation_counter
    num_models_seen = compilation_counter.num_models_seen
    try:
        _current_vllm_config = vllm_config
        _current_prefix = prefix
        yield
        # Everything below runs only when the `with` body finished without
        # raising; an exception at the `yield` goes straight to `finally`.
        logger.debug("enabled custom ops: %s",
                     vllm_config.compilation_config.enabled_custom_ops)
        logger.debug("disabled custom ops: %s",
                     vllm_config.compilation_config.disabled_custom_ops)
        if check_compile and \
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \
            and compilation_counter.num_models_seen == num_models_seen:
            # If the model supports compilation,
            # compilation_counter.num_models_seen should be increased
            # by at least 1.
            # If it is not increased, it means the model does not support
            # compilation (does not have @support_torch_compile decorator).
            logger.warning(
                "`torch.compile` is turned on, but the model %s"
                " does not support it. Please open an issue on GitHub"
                " if you want it to be supported.",
                vllm_config.model_config.model)
    finally:
        _current_vllm_config = old_vllm_config
        _current_prefix = old_prefix
        # Clear the compilation config cache when context changes
        get_cached_compilation_config.cache_clear()


@lru_cache(maxsize=1)
def get_cached_compilation_config():
    """Return the compilation config of the current vLLM config, memoized.

    The single cached entry saves repeated `get_current_vllm_config()`
    calls. `set_current_vllm_config` clears this cache when its context
    exits.
    """
    current_config = get_current_vllm_config()
    return current_config.compilation_config
936
937
938
939
940
941
942


def get_current_vllm_config() -> VllmConfig:
    """Return the current vLLM config, or a default one if none is set.

    When no config is set, a warning is logged and a fresh default
    `VllmConfig()` is returned; the module-level current config is left
    unset, so each such call builds a new default instance.
    """
    if _current_vllm_config is None:
        # in ci, usually when we test custom ops/modules directly,
        # we don't set the vllm config. In that case, we set a default
        # config.
        logger.warning("Current vLLM config is not set.")
        from vllm.config import VllmConfig
        return VllmConfig()
    return _current_vllm_config
947
948


949
950
951
952
953
954
955
956
957
def get_current_model_prefix() -> str:
    """
    Return the prefix of the model that is currently being initialized.

    Raises:
        AssertionError: If no prefix is currently set.
    """
    prefix = _current_prefix
    assert prefix is not None, "Current model prefix is not set. "
    return prefix


958
959
960
# Generic type variable; `get_layers_from_vllm_config` uses it to tie the
# requested `layer_type` to the value type of the dict it returns.
T = TypeVar("T")


961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
def get_layers_from_vllm_config(
        vllm_config: VllmConfig,
        layer_type: type[T],
        layer_names: Optional[list[str]] = None) -> dict[str, T]:
    """
    Get layers from the vLLM config.

    Args:
        vllm_config: The vLLM config.
        layer_type: The type of the layer to get.
        layer_names: The names of the layers to get. If None, return all layers.

    Returns:
        A dict mapping layer name to layer, restricted to the entries of the
        compilation config's `static_forward_context` that are instances of
        `layer_type`.

    Raises:
        KeyError: If a name in `layer_names` is not in the forward context.
    """
    forward_context = vllm_config.compilation_config.static_forward_context

    if layer_names is None:
        layer_names = list(forward_context.keys())

    selected = {}
    for layer_name in layer_names:
        # Look each layer up once; unknown names raise KeyError.
        layer = forward_context[layer_name]
        if isinstance(layer, layer_type):
            selected[layer_name] = layer
    return selected
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014


@config
@dataclass
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: Optional[int] = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        """Whether chunking is enabled, i.e. a split window size is set."""
        return self.min_energy_split_window_size is not None
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033


def update_config(config: DataclassInstanceT,
                  overrides: dict[str, Any]) -> DataclassInstanceT:
    """Return a copy of `config` with `overrides` applied.

    When the current value of a field is itself a dataclass and the override
    is not, the override must be a dict and is applied recursively to that
    nested dataclass. Any other override replaces the field value as-is.
    `config` itself is not modified.

    Args:
        config: The dataclass instance to copy.
        overrides: Mapping from field name to new value (or, for nested
            dataclass fields, to a dict of nested overrides).

    Raises:
        AssertionError: If a field name does not exist on `config`, or a
            nested dataclass field is overridden with something that is
            neither a dataclass nor a dict.
    """

    def _resolve(name: str, new_value: Any) -> Any:
        assert hasattr(config, name), f"{type(config)} has no field `{name}`"
        existing = getattr(config, name)
        if not is_dataclass(existing) or is_dataclass(new_value):
            # Plain field, or the caller supplied a ready-made dataclass.
            return new_value
        assert isinstance(new_value, dict), (
            f"Overrides to {type(config)}.{name} must be a dict"
            f"  or {type(existing)}, but got {type(new_value)}")
        return update_config(
            existing,  # type: ignore[type-var]
            new_value)

    resolved = {
        name: _resolve(name, new_value)
        for name, new_value in overrides.items()
    }
    return replace(config, **resolved)