Merge tag 'v0.8.4' into v0.8.4-ori

9c4ecf15 · zhuwenwen · bfc2d6f7 · dc1b4a6f · 9c4ecf15 · 9c4ecf15
Commit 9c4ecf15 authored Apr 14, 2025 by zhuwenwen
20 changed files
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                           PagedAttentionMetadata)
+from vllm.logger import init_logger
+logger = init_logger(__name__)
 _PARTITION_SIZE = 512
@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Ipex is not supported yet, it will fall"
+                " back to global attention for long context.")
        if blocksparse_params is not None:
            raise ValueError(
                "IPEX backend does not support block-sparse attention.")

--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O
 NOTE: in the actual code, 
-    `kv_b_proj` is [W_UK; W_UV] concatnated per head
+    `kv_b_proj` is [W_UK; W_UV] concatenated per head
-    `q_b_proj` is [W_UQ; W_QR] concatnated per head
+    `q_b_proj` is [W_UQ; W_QR] concatenated per head
    `out_proj` is W_O
@@ -205,6 +205,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
 from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
                                           compute_slot_mapping_start_idx,
                                           is_block_tables_empty)
+from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearBase, RowParallelLinear,
                                               UnquantizedLinearMethod)
@@ -218,9 +219,7 @@ from vllm.vllm_flash_attn.fa_utils import get_flash_attn_version
 if HAS_TRITON:
    from vllm.attention.ops.triton_flash_attention import triton_attention
-    from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
 else:
-    merge_attn_states = None
    triton_attention = None
 try:
@@ -668,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
            assert num_seqs > num_queries
        if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
            # decodes are scheduled together. In the first step, all the
            # prefills turn into decodes. This update reflects that
            # conversion.

--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadata, AttentionType,
                                              is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.logger import init_logger
+logger = init_logger(__name__)
 class PallasAttentionBackend(AttentionBackend):
@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Pallas is not supported yet, it will fall back "
+                "to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -462,11 +462,19 @@ class ROCmFlashAttentionImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in ROCm Flash Attention is not supported yet, it "
+                "will fall back to global attention for long context.")
        if blocksparse_params is not None:
            raise ValueError(
                "ROCmFlashAttention does not support blocksparse attention.")
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
        if logits_soft_cap is None:
            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
            self.logits_soft_cap = 0.0

--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
        if logits_soft_cap is not None:
            logger.warning_once("Torch SPDA does not support logits soft cap. "
                                "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in Torch SPDA is not supported yet, it will fall"
+                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        if logits_soft_cap is not None:
            logger.warning_once("XFormers does not support logits soft cap. "
                                "Outputs may be slightly off.")
+        if use_irope:
+            logger.warning_once(
+                "Using irope in XFormers is not supported yet, it will fall"
+                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
@@ -409,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        suppored_head_sizes = PagedAttention.get_supported_head_sizes()
+        supported_head_sizes = PagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        if head_size not in supported_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")
        self.attn_type = attn_type

--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+import torch
+from vllm.platforms import current_platform
+def merge_attn_states(
+    output: torch.Tensor,
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output_lse: Optional[torch.Tensor] = None,
+) -> None:
+    # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
+    # does not support FP8 dtype, so fall back to the Triton kernel.
+    def supported_dtypes(o: torch.Tensor) -> bool:
+        return o.dtype in [torch.float32, torch.half, torch.bfloat16]
+    # NOTE(DefTruth): Currently, custom merge_attn_states CUDA
+    # kernel load/store 128b(16 bytes) per memory issue within
+    # thread. Namely, the headsize(headdim) must be multiple of
+    # pack_size (float32 -> 4, half/bfloat16 -> 8).
+    def supported_headdim(o: torch.Tensor) -> bool:
+        headdim = o.shape[2]  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+        if o.dtype == torch.float32:
+            return headdim % 4 == 0
+        return headdim % 8 == 0
+    if (current_platform.is_cuda() and supported_dtypes(output)
+            and supported_headdim(output)):
+        from vllm._custom_ops import merge_attn_states
+        return merge_attn_states(output, prefix_output, prefix_lse,
+                                 suffix_output, suffix_lse, output_lse)
+    else:
+        from vllm.attention.ops.triton_merge_attn_states import (
+            merge_attn_states)
+        return merge_attn_states(output, prefix_output, prefix_lse,
+                                 suffix_output, suffix_lse, output_lse)
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -446,7 +446,7 @@ def flash_paged_attention(
    IO tensor dtypes:
      - This kernel assumes all IO tensors have the same dtype except for
        block_tables (int32) and mask (int32)
-      - If mixed_percision is True, then all Tensor Engine operation will be
+      - If mixed_precision is True, then all Tensor Engine operation will be
        performed in bfloat16 and accumulation will be performed in float32.
        Otherwise the intermediates will be in the same type as the inputs.

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )

--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -2,7 +2,6 @@
 import contextlib
 import copy
 import hashlib
-import importlib.metadata
 import os
 from contextlib import ExitStack
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -11,9 +10,9 @@ from unittest.mock import patch
 import torch
 import torch._inductor.compile_fx
 import torch.fx as fx
-from packaging.version import Version
 from vllm.config import VllmConfig
+from vllm.utils import is_torch_equal_or_newer
 class CompilerInterface:
@@ -379,7 +378,7 @@ class InductorAdaptor(CompilerInterface):
        manually setting up internal contexts. But we also rely on non-public
        APIs which might not provide these guarantees.
        """
-        if Version(importlib.metadata.version('torch')) >= Version("2.6"):
+        if is_torch_equal_or_newer("2.6"):
            import torch._dynamo.utils
            return torch._dynamo.utils.get_metrics_context()
        else:

--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
 # SPDX-License-Identifier: Apache-2.0
 import hashlib
-import importlib.metadata
 import inspect
 import json
 import types
 from typing import Any, Callable, Dict, Optional, Union
 import torch
-from packaging.version import Version
 from torch import fx
-if Version(importlib.metadata.version('torch')) >= Version("2.6"):
+from vllm.utils import is_torch_equal_or_newer
+if is_torch_equal_or_newer("2.6"):
    from torch._inductor.custom_graph_pass import CustomGraphPass
 else:
    # CustomGraphPass is not present in 2.5 or lower, import our version

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,21 +4,22 @@ import ast
 import copy
 import enum
 import hashlib
-import importlib.metadata
+import inspect
 import json
 import sys
+import textwrap
 import warnings
 from collections import Counter
 from collections.abc import Mapping
 from contextlib import contextmanager
-from dataclasses import dataclass, field, replace
+from dataclasses import (MISSING, dataclass, field, fields, is_dataclass,
+                         replace)
 from importlib.util import find_spec
 from pathlib import Path
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal,
-                    Optional, Protocol, Union)
+                    Optional, Protocol, TypeVar, Union)
 import torch
-from packaging.version import Version
 from pydantic import BaseModel, Field, PrivateAttr
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
@@ -40,10 +41,11 @@ from vllm.transformers_utils.config import (
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
-                        get_cpu_memory, get_open_port, random_uuid,
+                        get_cpu_memory, get_open_port, is_torch_equal_or_newer,
-                        resolve_obj_by_qualname)
+                        random_uuid, resolve_obj_by_qualname)
 if TYPE_CHECKING:
+    from _typeshed import DataclassInstance
    from ray.util.placement_group import PlacementGroup
    from vllm.executor.executor_base import ExecutorBase
@@ -52,8 +54,11 @@ if TYPE_CHECKING:
    from vllm.model_executor.model_loader.loader import BaseModelLoader
    from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
        BaseTokenizerGroup)
+    Config = TypeVar("Config", bound=DataclassInstance)
 else:
    QuantizationConfig = None
+    Config = TypeVar("Config")
 logger = init_logger(__name__)
@@ -106,6 +111,77 @@ class ModelImpl(str, enum.Enum):
    TRANSFORMERS = "transformers"
+def get_attr_docs(cls: type[Any]) -> dict[str, str]:
+    """
+    Get any docstrings placed after attribute assignments in a class body.
+    https://davidism.com/mit-license/
+    """
+    def pairwise(iterable):
+        """
+        Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise
+        Can be removed when Python 3.9 support is dropped.
+        """
+        iterator = iter(iterable)
+        a = next(iterator, None)
+        for b in iterator:
+            yield a, b
+            a = b
+    cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]
+    if not isinstance(cls_node, ast.ClassDef):
+        raise TypeError("Given object was not a class.")
+    out = {}
+    # Consider each pair of nodes.
+    for a, b in pairwise(cls_node.body):
+        # Must be an assignment then a constant string.
+        if (not isinstance(a, (ast.Assign, ast.AnnAssign))
+                or not isinstance(b, ast.Expr)
+                or not isinstance(b.value, ast.Constant)
+                or not isinstance(b.value.value, str)):
+            continue
+        doc = inspect.cleandoc(b.value.value)
+        # An assignment can have multiple targets (a = b = v), but an
+        # annotated assignment only has one target.
+        targets = a.targets if isinstance(a, ast.Assign) else [a.target]
+        for target in targets:
+            # Must be assigning to a plain name.
+            if not isinstance(target, ast.Name):
+                continue
+            out[target.id] = doc
+    return out
+def config(cls: type[Config]) -> type[Config]:
+    """
+    A decorator that ensures all fields in a dataclass have default values
+    and that each field has a docstring.
+    """
+    if not is_dataclass(cls):
+        raise TypeError("The decorated class must be a dataclass.")
+    attr_docs = get_attr_docs(cls)
+    for f in fields(cls):
+        if f.init and f.default is MISSING and f.default_factory is MISSING:
+            raise ValueError(
+                f"Field '{f.name}' in {cls.__name__} must have a default value."
+            )
+        if f.name not in attr_docs:
+            raise ValueError(
+                f"Field '{f.name}' in {cls.__name__} must have a docstring.")
+    return cls
 class ModelConfig:
    """Configuration for the model.
@@ -173,6 +249,9 @@ class ModelConfig:
            Defaults to True.
        config_format: The config format which shall be loaded.
            Defaults to 'auto' which defaults to 'hf'.
+        hf_token: The token to use as HTTP bearer authorization for remote
+            files. If `True`, will use the token generated when running
+            `huggingface-cli login` (stored in `~/.huggingface`).
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
@@ -256,6 +335,7 @@ class ModelConfig:
        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
        use_async_output_proc: bool = True,
        config_format: ConfigFormat = ConfigFormat.AUTO,
+        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
        disable_mm_preprocessor_cache: bool = False,
@@ -358,7 +438,7 @@ class ModelConfig:
                                            "attention_chunk_size", None)
        self.encoder_config = self._get_encoder_config()
        self.hf_image_processor_config = get_hf_image_processor_config(
-            self.model, revision)
+            self.model, hf_token=hf_token, revision=revision)
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.use_async_output_proc = use_async_output_proc
        self.mm_processor_kwargs = mm_processor_kwargs
@@ -503,6 +583,15 @@ class ModelConfig:
                    if getattr(user_config, k) is None:
                        setattr(user_config, k, v)
+            if self.is_matryoshka:
+                if user_config.normalize is None:
+                    user_config.normalize = True
+                elif not user_config.normalize:
+                    raise ValueError(
+                        "`normalize` must be enabled (set to True) "
+                        "for models that are compatible with "
+                        "Matryoshka Representation.")
            return user_config
        return None
@@ -1126,6 +1215,11 @@ class ModelConfig:
        architectures = getattr(self.hf_config, "architectures", [])
        return ModelRegistry.is_v1_compatible(architectures)
+    @property
+    def is_matryoshka(self) -> bool:
+        return (hasattr(self.hf_config, "matryoshka_dimensions")
+                or getattr(self.hf_config, "is_matryoshka", False))
 class CacheConfig:
    """Configuration for the KV cache.
@@ -1350,44 +1444,47 @@ class LoadFormat(str, enum.Enum):
    FASTSAFETENSORS = "fastsafetensors"
+@config
 @dataclass
 class LoadConfig:
-    """
+    """Configuration for loading the model weights."""
-        download_dir: Directory to download and load the weights, default to the
-            default cache directory of huggingface.
+    load_format: Union[str, LoadFormat,
-        load_format: The format of the model weights to load:
+                       "BaseModelLoader"] = LoadFormat.AUTO.value
-            "auto" will try to load the weights in the safetensors format and
+    """The format of the model weights to load:\n
-                fall back to the pytorch bin format if safetensors format is
+    - "auto" will try to load the weights in the safetensors format and fall
-                not available.
+    back to the pytorch bin format if safetensors format is not available.\n
-            "pt" will load the weights in the pytorch bin format.
+    - "pt" will load the weights in the pytorch bin format.\n
-            "safetensors" will load the weights in the safetensors format.
+    - "safetensors" will load the weights in the safetensors format.\n
-            "npcache" will load the weights in pytorch format and store
+    - "npcache" will load the weights in pytorch format and store a numpy cache
-                a numpy cache to speed up the loading.
+    to speed up the loading.\n
-            "dummy" will initialize the weights with random values, which is
+    - "dummy" will initialize the weights with random values, which is mainly
-                mainly for profiling.
+    for profiling.\n
-            "tensorizer" will use CoreWeave's tensorizer library for
+    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
-                fast weight loading.
+    loading. See the Tensorize vLLM Model script in the Examples section for
-            "bitsandbytes" will load nf4 type weights.
+    more information.\n
-            "sharded_state" will load weights from pre-sharded checkpoint files,
+    - "runai_streamer" will load the Safetensors weights using Run:ai Model
-                supporting efficient loading of tensor-parallel models.
+    Streamer.\n
-            "gguf" will load weights from GGUF format files.
+    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
-            "mistral" will load weights from consolidated safetensors files used
+    - "sharded_state" will load weights from pre-sharded checkpoint files,
-                by Mistral models.
+    supporting efficient loading of tensor-parallel models.\n
-            "runai_streamer" will load weights from RunAI streamer format files.
+    - "gguf" will load weights from GGUF format files (details specified in
-        model_loader_extra_config: The extra config for the model loader.
+    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
-        ignore_patterns: The list of patterns to ignore when loading the model.
+    - "mistral" will load weights from consolidated safetensors files used by
-            Default to "original/**/*" to avoid repeated loading of llama's
+    Mistral models."""
-            checkpoints.
-        use_tqdm_on_load: Whether to enable tqdm for showing progress bar during
-            loading. Default to True
-    """
-    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
-    model_loader_extra_config: Optional[Union[str, dict]] = field(
+    """Directory to download and load the weights, default to the default
-        default_factory=dict)
+    cache directory of Hugging Face."""
+    model_loader_extra_config: Optional[Union[str, dict]] = None
+    """Extra config for model loader. This will be passed to the model loader
+    corresponding to the chosen load_format. This should be a JSON string that
+    will be parsed into a dictionary."""
    ignore_patterns: Optional[Union[list[str], str]] = None
+    """The list of patterns to ignore when loading the model. Default to
+    "original/**/*" to avoid repeated loading of llama's checkpoints."""
    use_tqdm_on_load: bool = True
+    """Whether to enable tqdm for showing progress bar when loading model
+    weights."""
    def compute_hash(self) -> str:
        """
@@ -1425,61 +1522,77 @@ class LoadConfig:
            self.ignore_patterns = ["original/**/*"]
+@config
 @dataclass
 class ParallelConfig:
    """Configuration for the distributed execution."""
-    pipeline_parallel_size: int = 1  # Number of pipeline parallel groups.
+    pipeline_parallel_size: int = 1
-    tensor_parallel_size: int = 1  # Number of tensor parallel groups.
+    """Number of pipeline parallel groups."""
-    data_parallel_size: int = 1  # Number of data parallel groups.
+    tensor_parallel_size: int = 1
-    data_parallel_rank: int = 0  # Rank of the data parallel group.
+    """Number of tensor parallel groups."""
-    # Local rank of the data parallel group, defaults to global rank.
+    data_parallel_size: int = 1
+    """Number of data parallel groups. MoE layers will be sharded according to
+    the product of the tensor parallel size and data parallel size."""
+    data_parallel_rank: int = 0
+    """Rank of the data parallel group."""
    data_parallel_rank_local: Optional[int] = None
-    # IP of the data parallel master.
+    """Local rank of the data parallel group, defaults to global rank."""
    data_parallel_master_ip: str = "127.0.0.1"
-    data_parallel_master_port: int = 29500  # Port of the data parallel master.
+    """IP of the data parallel master."""
-    enable_expert_parallel: bool = False  # Use EP instead of TP for MoE layers.
+    data_parallel_master_port: int = 29500
+    """Port of the data parallel master."""
+    enable_expert_parallel: bool = False
+    """Use expert parallelism instead of tensor parallelism for MoE layers."""
-    # Maximum number of multiple batches
-    # when load model sequentially. To avoid RAM OOM when using tensor
-    # parallel and large models.
    max_parallel_loading_workers: Optional[int] = None
+    """Maximum number of parallel loading workers when loading model
+    sequentially in multiple batches. To avoid RAM OOM when using tensor
+    parallel and large models."""
-    # Disable the custom all-reduce kernel and fall back to NCCL.
    disable_custom_all_reduce: bool = False
+    """Disable the custom all-reduce kernel and fall back to NCCL."""
-    # Config for the tokenizer pool. If None, will use synchronous tokenization.
    tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
+    """Config for the tokenizer pool. If None, will use synchronous
+    tokenization."""
-    # Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
    ray_workers_use_nsight: bool = False
+    """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
-    # ray distributed model workers placement group.
    placement_group: Optional["PlacementGroup"] = None
+    """ray distributed model workers placement group."""
-    # Backend to use for distributed model
-    # workers, either "ray" or "mp" (multiprocessing). If the product
-    # of pipeline_parallel_size and tensor_parallel_size is less than
-    # or equal to the number of GPUs available, "mp" will be used to
-    # keep processing on a single host. Otherwise, this will default
-    # to "ray" if Ray is installed and fail otherwise. Note that tpu
-    # and hpu only support Ray for distributed inference.
    distributed_executor_backend: Optional[Union[str,
                                                 type["ExecutorBase"]]] = None
+    """Backend to use for distributed model
+    workers, either "ray" or "mp" (multiprocessing). If the product
+    of pipeline_parallel_size and tensor_parallel_size is less than
+    or equal to the number of GPUs available, "mp" will be used to
+    keep processing on a single host. Otherwise, this will default
+    to "ray" if Ray is installed and fail otherwise. Note that tpu
+    and hpu only support Ray for distributed inference."""
-    # the full name of the worker class to use. If "auto", the worker class
-    # will be determined based on the platform.
    worker_cls: str = "auto"
+    """The full name of the worker class to use. If "auto", the worker class
+    will be determined based on the platform."""
    sd_worker_cls: str = "auto"
+    """The full name of the worker class to use for speculative decoding.
+    If "auto", the worker class will be determined based on the platform."""
    worker_extension_cls: str = ""
+    """The full name of the worker extension class to use. The worker extension
+    class is dynamically inherited by the worker class. This is used to inject
+    new attributes and methods to the worker class for use in collective_rpc
+    calls."""
-    # world_size is TPxPP, it affects the number of workers we create.
    world_size: int = field(init=False)
-    # world_size_across_dp is TPxPPxDP, it is the size of the world
+    """world_size is TPxPP, it affects the number of workers we create."""
-    # including data parallelism.
    world_size_across_dp: int = field(init=False)
+    """world_size_across_dp is TPxPPxDP, it is the size of the world
+    including data parallelism."""
    rank: int = 0
+    """Global rank in distributed setup."""
    def get_next_dp_init_port(self) -> int:
        """
@@ -1717,6 +1830,14 @@ class SchedulerConfig:
    chunked_prefill_enabled: bool = field(init=False)
+    # If set to true and chunked prefill is enabled, we do not want to
+    # partially schedule a multimodal item. Only used in V1
+    # This ensures that if a request has a mixed prompt
+    # (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
+    # some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
+    # it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
+    disable_chunked_mm_input: bool = False
    # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
    # or "mod.custom_class".
    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
@@ -2468,6 +2589,11 @@ class LoRAConfig:
            logger.warning("LoRA with chunked prefill is still experimental "
                           "and may be unstable.")
+    def verify_lora_support(self):
+        if self.long_lora_scaling_factors is not None and envs.VLLM_USE_V1:
+            raise ValueError(
+                "V1 LoRA does not support long LoRA, please use V0.")
 @dataclass
 class PromptAdapterConfig:
@@ -2541,14 +2667,20 @@ class MultiModalConfig:
                               usedforsecurity=False).hexdigest()
        return hash_str
+    def get_default_limit_per_prompt(self) -> int:
+        """
+        Return the default number of input items allowed per prompt
+        for any modality if not specified by the user.
+        """
+        return 999 if envs.VLLM_USE_V1 else 1
    def get_limit_per_prompt(self, modality: str) -> int:
        """
        Get the maximum number of input items allowed per prompt
        for the given modality.
-        If not set by the user, this defaults to `1`.
        """
-        return self.limit_per_prompt.get(modality, 1)
+        default = self.get_default_limit_per_prompt()
+        return self.limit_per_prompt.get(modality, default)
    # TODO: Add configs to init vision tower or not.
@@ -2871,7 +3003,7 @@ class DecodingConfig:
    # Which guided decoding algo to use.
    # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
    reasoning_backend: Optional[str] = None
@@ -2896,7 +3028,7 @@ class DecodingConfig:
    def __post_init__(self):
        v0_valid_guided_backends = [
-            'outlines', 'lm-format-enforcer', 'xgrammar'
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
        ]
        v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
@@ -3268,7 +3400,7 @@ class CompilationConfig(BaseModel):
        #    and it is not yet a priority. RFC here:
        #    https://github.com/vllm-project/vllm/issues/14703
-        if Version(importlib.metadata.version('torch')) >= Version("2.6"):
+        if is_torch_equal_or_newer("2.6"):
            KEY = 'enable_auto_functionalized_v2'
            if KEY not in self.inductor_compile_config:
                self.inductor_compile_config[KEY] = False
@@ -3567,6 +3699,7 @@ class VllmConfig:
            self.lora_config.verify_with_model_config(self.model_config)
            self.lora_config.verify_with_scheduler_config(
                self.scheduler_config)
+            self.lora_config.verify_lora_support()
        if self.prompt_adapter_config:
            self.prompt_adapter_config.verify_with_model_config(
                self.model_config)
@@ -3769,7 +3902,9 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
    try:
        _current_vllm_config = vllm_config
        yield
-    finally:
+    except Exception:
+        raise
+    else:
        logger.debug("enabled custom ops: %s",
                     vllm_config.compilation_config.enabled_custom_ops)
        logger.debug("disabled custom ops: %s",
@@ -3787,6 +3922,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
                " does not support it. Please open an issue on GitHub"
                " if you want it to be supported.",
                vllm_config.model_config.model)
+    finally:
        _current_vllm_config = old_vllm_config

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -194,9 +194,11 @@ class GroupCoordinator:
        from vllm.platforms import current_platform
-        # TODO: fix it for other platforms
        if current_platform.is_cuda_alike():
            self.device = torch.device(f"cuda:{local_rank}")
+        elif current_platform.is_out_of_tree():
+            self.device = torch.device(
+                f"{current_platform.device_name}:{local_rank}")
        else:
            self.device = torch.device("cpu")

--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -102,10 +102,11 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int,
        if remaining_layers := num_hidden_layers % pp_size:
            for i in range(2, remaining_layers + 2):
                partitions[-i] += 1
-            logger.info("Hidden layers were unevenly partitioned: %s",
+            logger.info(
-                        ",".join(str(p) for p in partitions))
+                "Hidden layers were unevenly partitioned: [%s]. "
-            logger.info("This can be manually overridden using the "
+                "This can be manually overridden using the "
-                        "VLLM_PP_LAYER_PARTITION environment variable")
+                "VLLM_PP_LAYER_PARTITION environment variable",
+                ",".join(str(p) for p in partitions))
    start_layer = sum(partitions[:pp_rank])
    end_layer = start_layer + partitions[pp_rank]

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -3,10 +3,11 @@
 import argparse
 import dataclasses
 import json
+import re
 import threading
-from dataclasses import dataclass
+from dataclasses import MISSING, dataclass, fields
 from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
-                    Tuple, Type, Union, cast, get_args)
+                    Tuple, Type, Union, cast, get_args, get_origin)
 import torch
@@ -18,7 +19,7 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
                         ModelConfig, ModelImpl, ObservabilityConfig,
                         ParallelConfig, PoolerConfig, PromptAdapterConfig,
                         SchedulerConfig, SpeculativeConfig, TaskOption,
-                         TokenizerPoolConfig, VllmConfig)
+                         TokenizerPoolConfig, VllmConfig, get_attr_docs)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -100,8 +101,8 @@ class EngineArgs:
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    allowed_local_media_path: str = ""
-    download_dir: Optional[str] = None
+    download_dir: Optional[str] = LoadConfig.download_dir
-    load_format: str = 'auto'
+    load_format: str = LoadConfig.load_format
    config_format: ConfigFormat = ConfigFormat.AUTO
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
@@ -110,14 +111,15 @@ class EngineArgs:
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
-    distributed_executor_backend: Optional[Union[str,
+    distributed_executor_backend: Optional[Union[
-                                                 Type[ExecutorBase]]] = None
+        str, Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
    # number of P/D disaggregation (or other disaggregation) workers
-    pipeline_parallel_size: int = 1
+    pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
-    tensor_parallel_size: int = 1
+    tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
-    data_parallel_size: int = 1
+    data_parallel_size: int = ParallelConfig.data_parallel_size
-    enable_expert_parallel: bool = False
+    enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
-    max_parallel_loading_workers: Optional[int] = None
+    max_parallel_loading_workers: Optional[
+        int] = ParallelConfig.max_parallel_loading_workers
    block_size: Optional[int] = None
    enable_prefix_caching: Optional[bool] = None
    prefix_caching_hash_algo: str = "builtin"
@@ -138,12 +140,13 @@ class EngineArgs:
    code_revision: Optional[str] = None
    rope_scaling: Optional[Dict[str, Any]] = None
    rope_theta: Optional[float] = None
+    hf_token: Optional[Union[bool, str]] = None
    hf_overrides: Optional[HfOverrides] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: Optional[bool] = None
    max_seq_len_to_capture: int = 8192
-    disable_custom_all_reduce: bool = False
+    disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
    tokenizer_pool_size: int = 0
    # Note: Specifying a tokenizer pool by passing a class
    # is intended for expert use only. The API may change without
@@ -168,17 +171,20 @@ class EngineArgs:
    device: str = 'auto'
    num_scheduler_steps: int = 1
    multi_step_stream_outputs: bool = True
-    ray_workers_use_nsight: bool = False
+    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
-    model_loader_extra_config: Optional[dict] = None
+    model_loader_extra_config: Optional[
-    ignore_patterns: Optional[Union[str, List[str]]] = None
+        dict] = LoadConfig.model_loader_extra_config
+    ignore_patterns: Optional[Union[str,
+                                    List[str]]] = LoadConfig.ignore_patterns
    preemption_mode: Optional[str] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: Optional[bool] = None
+    disable_chunked_mm_input: bool = False
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
    logits_processor_pattern: Optional[str] = None
    speculative_config: Optional[Dict[str, Any]] = None
@@ -194,8 +200,8 @@ class EngineArgs:
    override_neuron_config: Optional[Dict[str, Any]] = None
    override_pooler_config: Optional[PoolerConfig] = None
    compilation_config: Optional[CompilationConfig] = None
-    worker_cls: str = "auto"
+    worker_cls: str = ParallelConfig.worker_cls
-    worker_extension_cls: str = ""
+    worker_extension_cls: str = ParallelConfig.worker_extension_cls
    kv_transfer_config: Optional[KVTransferConfig] = None
@@ -209,7 +215,7 @@ class EngineArgs:
    additional_config: Optional[Dict[str, Any]] = None
    enable_reasoning: Optional[bool] = None
    reasoning_parser: Optional[str] = None
-    use_tqdm_on_load: bool = True
+    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
    def __post_init__(self):
        if not self.tokenizer:
@@ -229,6 +235,39 @@ class EngineArgs:
    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Shared CLI arguments for vLLM engine."""
+        def is_type_in_union(cls: type[Any], type: type[Any]) -> bool:
+            """Check if the class is a type in a union type.
+
+            Returns True only when ``cls`` is a ``typing.Union[...]`` whose
+            arguments include ``type``; any non-Union annotation yields False.
+            """
+            # NOTE: the parameter ``type`` shadows the builtin inside this
+            # helper only. ``get_origin`` is compared against typing.Union,
+            # so PEP 604 unions (``X | Y``, origin types.UnionType) do not
+            # match here.
+            return get_origin(cls) is Union and type in get_args(cls)
+        def is_optional(cls: type[Any]) -> bool:
+            """Check if the class is an optional type.
+
+            "Optional" here means a ``typing.Union`` that lists ``NoneType``
+            among its arguments, which is what ``Optional[X]`` expands to.
+            """
+            return is_type_in_union(cls, type(None))
+        def get_kwargs(cls: type[Any]) -> Dict[str, Any]:
+            """Build ``add_argument`` keyword arguments for a dataclass.
+
+            Returns a dict keyed by field name. Each value is itself a dict
+            holding ``default``, ``help`` and, except for ``bool`` fields,
+            ``type``; callers splat it into ``add_argument``.
+            """
+            # Presumably maps each field name to its documentation string;
+            # get_attr_docs is defined outside this view - verify.
+            cls_docs = get_attr_docs(cls)
+            kwargs = {}
+            for field in fields(cls):
+                name = field.name
+                # One of these will always be present
+                # NOTE(review): when only a default_factory exists, the
+                # factory callable itself (not its result) is stored as the
+                # default. A field with neither default nor factory would
+                # store dataclasses.MISSING - confirm no such field is used.
+                default = (field.default_factory
+                           if field.default is MISSING else field.default)
+                kwargs[name] = {"default": default, "help": cls_docs[name]}
+                # When using action="store_true"
+                # add_argument doesn't accept type
+                if field.type is bool:
+                    continue
+                # Handle optional fields
+                # NOTE(review): every Optional[...] field gets nullable_str,
+                # whatever its inner type. If nullable_str yields a str, an
+                # Optional[int] option would reach the config as text rather
+                # than as an int - confirm against nullable_str's definition.
+                if is_optional(field.type):
+                    kwargs[name]["type"] = nullable_str
+                    continue
+                # Handle str in union fields
+                if is_type_in_union(field.type, str):
+                    kwargs[name]["type"] = str
+                    continue
+                # Plain annotations (int, str, ...) are used directly as the
+                # argparse converter.
+                kwargs[name]["type"] = field.type
+            return kwargs
        # Model arguments
        parser.add_argument(
            '--model',
@@ -304,38 +343,23 @@ class EngineArgs:
            "from directories specified by the server file system. "
            "This is a security risk. "
            "Should only be enabled in trusted environments.")
-        parser.add_argument('--download-dir',
+        # Model loading arguments
-                            type=nullable_str,
+        load_kwargs = get_kwargs(LoadConfig)
-                            default=EngineArgs.download_dir,
+        load_group = parser.add_argument_group(
-                            help='Directory to download and load the weights.')
+            title="LoadConfig",
-        parser.add_argument(
+            description=LoadConfig.__doc__,
-            '--load-format',
+        )
-            type=str,
+        load_group.add_argument('--load-format',
-            default=EngineArgs.load_format,
+                                choices=[f.value for f in LoadFormat],
-            choices=[f.value for f in LoadFormat],
+                                **load_kwargs["load_format"])
-            help='The format of the model weights to load.\n\n'
+        load_group.add_argument('--download-dir',
-            '* "auto" will try to load the weights in the safetensors format '
+                                **load_kwargs["download_dir"])
-            'and fall back to the pytorch bin format if safetensors format '
+        load_group.add_argument('--model-loader-extra-config',
-            'is not available.\n'
+                                **load_kwargs["model_loader_extra_config"])
-            '* "pt" will load the weights in the pytorch bin format.\n'
+        load_group.add_argument('--use-tqdm-on-load',
-            '* "safetensors" will load the weights in the safetensors format.\n'
+                                action=argparse.BooleanOptionalAction,
-            '* "npcache" will load the weights in pytorch format and store '
+                                **load_kwargs["use_tqdm_on_load"])
-            'a numpy cache to speed up the loading.\n'
-            '* "dummy" will initialize the weights with random values, '
-            'which is mainly for profiling.\n'
-            '* "tensorizer" will load the weights using tensorizer from '
-            'CoreWeave. See the Tensorize vLLM Model script in the Examples '
-            'section for more information.\n'
-            '* "runai_streamer" will load the Safetensors weights using Run:ai'
-            'Model Streamer.\n'
-            '* "bitsandbytes" will load the weights using bitsandbytes '
-            'quantization.\n'
-            '* "sharded_state" will load weights from pre-sharded checkpoint '
-            'files, supporting efficient loading of tensor-parallel models\n'
-            '* "gguf" will load weights from GGUF format files (details '
-            'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n'
-            '* "mistral" will load weights from consolidated safetensors files '
-            'used by Mistral models.\n')
        parser.add_argument(
            '--config-format',
            default=EngineArgs.config_format,
@@ -367,20 +391,24 @@ class EngineArgs:
            'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
            'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
        parser.add_argument('--max-model-len',
-                            type=int,
+                            type=human_readable_int,
                            default=EngineArgs.max_model_len,
                            help='Model context length. If unspecified, will '
-                            'be automatically derived from the model config.')
+                            'be automatically derived from the model config. '
+                            'Supports k/m/g/K/M/G in human-readable format.\n'
+                            'Examples:\n'
+                            '- 1k → 1000\n'
+                            '- 1K → 1024\n')
        parser.add_argument(
            '--guided-decoding-backend',
            type=str,
-            default='xgrammar',
+            default=DecodingConfig.guided_decoding_backend,
            help='Which engine will be used for guided decoding'
            ' (JSON schema / regex etc) by default. Currently support '
            'https://github.com/mlc-ai/xgrammar and '
            'https://github.com/guidance-ai/llguidance.'
            'Valid backend values are "xgrammar", "guidance", and "auto". '
-            'With "auto", we will make opinionated choices based on request'
+            'With "auto", we will make opinionated choices based on request '
            'contents and what the backend libraries currently support, so '
            'the behavior is subject to change in each release.')
        parser.add_argument(
@@ -404,52 +432,37 @@ class EngineArgs:
            '* "transformers" will use the Transformers model '
            'implementation.\n')
        # Parallel arguments
-        parser.add_argument(
+        parallel_kwargs = get_kwargs(ParallelConfig)
+        parallel_group = parser.add_argument_group(
+            title="ParallelConfig",
+            description=ParallelConfig.__doc__,
+        )
+        parallel_group.add_argument(
            '--distributed-executor-backend',
            choices=['ray', 'mp', 'uni', 'external_launcher'],
-            default=EngineArgs.distributed_executor_backend,
+            **parallel_kwargs["distributed_executor_backend"])
-            help='Backend to use for distributed model '
+        parallel_group.add_argument(
-            'workers, either "ray" or "mp" (multiprocessing). If the product '
+            '--pipeline-parallel-size', '-pp',
-            'of pipeline_parallel_size and tensor_parallel_size is less than '
+            **parallel_kwargs["pipeline_parallel_size"])
-            'or equal to the number of GPUs available, "mp" will be used to '
+        parallel_group.add_argument('--tensor-parallel-size', '-tp',
-            'keep processing on a single host. Otherwise, this will default '
+                                    **parallel_kwargs["tensor_parallel_size"])
-            'to "ray" if Ray is installed and fail otherwise. Note that tpu '
+        parallel_group.add_argument('--data-parallel-size', '-dp',
-            'only supports Ray for distributed inference.')
+                                    **parallel_kwargs["data_parallel_size"])
+        parallel_group.add_argument(
-        parser.add_argument('--pipeline-parallel-size',
-                            '-pp',
-                            type=int,
-                            default=EngineArgs.pipeline_parallel_size,
-                            help='Number of pipeline stages.')
-        parser.add_argument('--tensor-parallel-size',
-                            '-tp',
-                            type=int,
-                            default=EngineArgs.tensor_parallel_size,
-                            help='Number of tensor parallel replicas.')
-        parser.add_argument('--data-parallel-size',
-                            '-dp',
-                            type=int,
-                            default=EngineArgs.data_parallel_size,
-                            help='Number of data parallel replicas. '
-                            'MoE layers will be sharded according to the '
-                            'product of the tensor-parallel-size and '
-                            'data-parallel-size.')
-        parser.add_argument(
            '--enable-expert-parallel',
            action='store_true',
-            help='Use expert parallelism instead of tensor parallelism '
+            **parallel_kwargs["enable_expert_parallel"])
-            'for MoE layers.')
+        parallel_group.add_argument(
-        parser.add_argument(
            '--max-parallel-loading-workers',
-            type=int,
+            **parallel_kwargs["max_parallel_loading_workers"])
-            default=EngineArgs.max_parallel_loading_workers,
+        parallel_group.add_argument(
-            help='Load model sequentially in multiple batches, '
-            'to avoid RAM OOM when using tensor '
-            'parallel and large models.')
-        parser.add_argument(
            '--ray-workers-use-nsight',
            action='store_true',
-            help='If specified, use nsight to profile Ray workers.')
+            **parallel_kwargs["ray_workers_use_nsight"])
+        parallel_group.add_argument(
+            '--disable-custom-all-reduce',
+            action='store_true',
+            **parallel_kwargs["disable_custom_all_reduce"])
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
@@ -602,6 +615,16 @@ class EngineArgs:
                            help='RoPE theta. Use with `rope_scaling`. In '
                            'some cases, changing the RoPE theta improves the '
                            'performance of the scaled model.')
+        parser.add_argument(
+            '--hf-token',
+            type=str,
+            nargs='?',
+            const=True,
+            default=None,
+            help='The token to use as HTTP bearer authorization'
+            ' for remote files. If `True`, will use the token '
+            'generated when running `huggingface-cli login` '
+            '(stored in `~/.huggingface`).')
        parser.add_argument('--hf-overrides',
                            type=json.loads,
                            default=EngineArgs.hf_overrides,
@@ -622,10 +645,6 @@ class EngineArgs:
                            'Additionally for encoder-decoder models, if the '
                            'sequence length of the encoder input is larger '
                            'than this, we fall back to the eager mode.')
-        parser.add_argument('--disable-custom-all-reduce',
-                            action='store_true',
-                            default=EngineArgs.disable_custom_all_reduce,
-                            help='See ParallelConfig.')
        parser.add_argument('--tokenizer-pool-size',
                            type=int,
                            default=EngineArgs.tokenizer_pool_size,
@@ -652,13 +671,13 @@ class EngineArgs:
            type=nullable_kvs,
            default=EngineArgs.limit_mm_per_prompt,
            # The default value is given in
-            # MultiModalConfig.get_limit_per_prompt
+            # MultiModalConfig.get_default_limit_per_prompt
            help=('For each multimodal plugin, limit how many '
                  'input instances to allow for each prompt. '
                  'Expects a comma-separated list of items, '
                  'e.g.: `image=16,video=2` allows a maximum of 16 '
-                  'images and 2 videos per prompt. Defaults to 1 for '
+                  'images and 2 videos per prompt. Defaults to '
-                  'each modality.'))
+                  '1 (V0) or 999 (V1) for each modality.'))
        parser.add_argument(
            '--mm-processor-kwargs',
            default=None,
@@ -746,14 +765,6 @@ class EngineArgs:
                            default=1,
                            help=('Maximum number of forward steps per '
                                  'scheduler call.'))
-        parser.add_argument(
-            '--use-tqdm-on-load',
-            dest='use_tqdm_on_load',
-            action=argparse.BooleanOptionalAction,
-            default=EngineArgs.use_tqdm_on_load,
-            help='Whether to enable/disable progress bar '
-            'when loading model weights.',
-        )
        parser.add_argument(
            '--multi-step-stream-outputs',
@@ -782,15 +793,6 @@ class EngineArgs:
                            default=None,
                            help='The configurations for speculative decoding.'
                            ' Should be a JSON string.')
-        parser.add_argument('--model-loader-extra-config',
-                            type=nullable_str,
-                            default=EngineArgs.model_loader_extra_config,
-                            help='Extra config for model loader. '
-                            'This will be passed to the model loader '
-                            'corresponding to the chosen load_format. '
-                            'This should be a JSON string that will be '
-                            'parsed into a dictionary.')
        parser.add_argument(
            '--ignore-patterns',
            action="append",
@@ -1001,6 +1003,20 @@ class EngineArgs:
            "Note that even if this is set to False, cascade attention will be "
            "only used when the heuristic tells that it's beneficial.")
+        parser.add_argument(
+            "--disable-chunked-mm-input",
+            action=StoreBoolean,
+            default=EngineArgs.disable_chunked_mm_input,
+            nargs="?",
+            const="True",
+            help="Disable multimodal input chunking attention for V1. "
+            "If set to true and chunked prefill is enabled, we do not want to"
+            " partially schedule a multimodal item. This ensures that if a "
+            "request has a mixed prompt (like text tokens TTTT followed by "
+            "image tokens IIIIIIIIII) where only some image tokens can be "
+            "scheduled (like TTTTIIIII, leaving IIIII), it will be scheduled "
+            "as TTTT in one step and IIIIIIIIII in the next.")
        return parser
    @classmethod
@@ -1038,6 +1054,7 @@ class EngineArgs:
            code_revision=self.code_revision,
            rope_scaling=self.rope_scaling,
            rope_theta=self.rope_theta,
+            hf_token=self.hf_token,
            hf_overrides=self.hf_overrides,
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
@@ -1244,6 +1261,7 @@ class EngineArgs:
            num_lookahead_slots=num_lookahead_slots,
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
+            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,
            preemption_mode=self.preemption_mode,
            num_scheduler_steps=self.num_scheduler_steps,
@@ -1275,6 +1293,10 @@ class EngineArgs:
            self.model_loader_extra_config[
                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
+        # bitsandbytes pre-quantized models need a specific model loader
+        if model_config.quantization == "bitsandbytes":
+            self.quantization = self.load_format = "bitsandbytes"
        load_config = self.create_load_config()
        prompt_adapter_config = PromptAdapterConfig(
@@ -1650,12 +1672,14 @@ class EngineArgs:
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
+            default_max_num_seqs = 1024
        else:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
+            default_max_num_seqs = 256
        use_context_value = usage_context.value if usage_context else None
        if (self.max_num_batched_tokens is None
@@ -1666,7 +1690,6 @@ class EngineArgs:
                "Setting max_num_batched_tokens to %d for %s usage context.",
                self.max_num_batched_tokens, use_context_value)
-        default_max_num_seqs = 1024
        if self.max_num_seqs is None:
            self.max_num_seqs = default_max_num_seqs
@@ -1723,6 +1746,47 @@ def _warn_or_fallback(feature_name: str) -> bool:
    return should_exit
+def human_readable_int(value):
+    """Parse a human-readable integer such as '1k', '2M' or '25.6k'.
+
+    Lower-case suffixes are decimal multipliers (k=10**3, m=10**6, g=10**9,
+    t=10**12) and may follow a fractional number. Upper-case suffixes are
+    binary multipliers (K=2**10, M=2**20, G=2**30, T=2**40) and require a
+    whole number. A value without a suffix is parsed with ``int``.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+
+    Raises:
+        argparse.ArgumentTypeError: a fractional number is combined with a
+            binary suffix.
+        ValueError: the value is neither a plain integer nor a number
+            followed by one supported suffix.
+    """
+    value = value.strip()
+    match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
+    if match is None:
+        # Regular plain number.
+        return int(value)
+    # Every suffix the pattern accepts must have an entry in one of these
+    # tables; otherwise a matched value would fall through to int(value)
+    # and fail with an unrelated "invalid literal" error.
+    decimal_multiplier = {
+        'k': 10**3,
+        'm': 10**6,
+        'g': 10**9,
+        't': 10**12,
+    }
+    binary_multiplier = {
+        'K': 2**10,
+        'M': 2**20,
+        'G': 2**30,
+        'T': 2**40,
+    }
+    number, suffix = match.groups()
+    if suffix in decimal_multiplier:
+        return int(float(number) * decimal_multiplier[suffix])
+    # Do not allow decimals with binary multipliers
+    try:
+        return int(number) * binary_multiplier[suffix]
+    except ValueError as e:
+        raise argparse.ArgumentTypeError(
+            "Decimals are not allowed "
+            f"with binary suffixes like {suffix}. Did you mean to use "
+            f"{number}{suffix.lower()} instead?") from e
 # These functions are used by sphinx to build the documentation
 def _engine_args_parser():
    return EngineArgs.add_cli_args(FlexibleArgumentParser())

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -8,7 +8,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+                    Iterable, List, Literal, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload
@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.logits_processors import (
    get_logits_processors as get_openai_logits_processors)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
-                         PromptType)
+                         PromptType, SingletonInputs)
 from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
 from vllm.inputs.preprocess import InputPreprocessor
 from vllm.logger import init_logger
@@ -40,6 +40,7 @@ from vllm.model_executor.guided_decoding import (
    get_local_guided_decoding_logits_processor)
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
+from vllm.multimodal.processing import EncDecMultiModalProcessor
 from vllm.outputs import (PoolingRequestOutput, RequestOutput,
                          RequestOutputFactory)
 from vllm.pooling_params import PoolingParams
@@ -2029,29 +2030,61 @@ class LLMEngine:
                               lora_request: Optional[LoRARequest]):
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
-        # For encoder-decoder multimodal models, the max_prompt_len
+        if encoder_inputs is not None:
-        # restricts the decoder prompt length
+            self._validate_model_input(encoder_inputs,
-        if self.model_config.is_multimodal_model:
+                                       lora_request,
-            prompt_inputs = decoder_inputs
+                                       prompt_type="encoder")
-        else:
-            prompt_inputs = encoder_inputs or decoder_inputs
-        prompt_ids = prompt_inputs["prompt_token_ids"]
-        if prompt_ids is None or len(prompt_ids) == 0:
+        self._validate_model_input(decoder_inputs,
-            raise ValueError("Prompt cannot be empty")
+                                   lora_request,
+                                   prompt_type="decoder")
-        if self.model_config.is_multimodal_model:
+    def _validate_model_input(
-            max_prompt_len = self.model_config.max_model_len
+        self,
+        prompt_inputs: SingletonInputs,
+        lora_request: Optional[LoRARequest],
+        *,
+        prompt_type: Literal["encoder", "decoder"],
+    ):
+        model_config = self.model_config
+        tokenizer = (None if self.tokenizer is None else
+                     self.tokenizer.get_lora_tokenizer(lora_request))
-            if len(prompt_ids) > max_prompt_len:
+        prompt_ids = prompt_inputs["prompt_token_ids"]
-                raise ValueError(
+        if not prompt_ids:
-                    f"The prompt (total length {len(prompt_ids)}) is too long "
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
-                    f"to fit into the model (context length {max_prompt_len}). "
+                pass  # Mllama may have empty encoder inputs for text-only data
+            else:
+                raise ValueError(f"The {prompt_type} prompt cannot be empty")
+        max_prompt_len = self.model_config.max_model_len
+        if len(prompt_ids) >= max_prompt_len:
+            if prompt_type == "encoder" and model_config.is_multimodal_model:
+                mm_registry = self.input_preprocessor.mm_registry
+                mm_processor = mm_registry.create_processor(
+                    model_config,
+                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
+                )
+                assert isinstance(mm_processor, EncDecMultiModalProcessor)
+                if mm_processor.pad_dummy_encoder_prompt:
+                    return  # Skip encoder length check for Whisper
+            if model_config.is_multimodal_model:
+                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")
+            else:
+                suggestion = (
+                    "Make sure that `max_model_len` is no smaller than the "
+                    "number of text tokens.")
+            raise ValueError(
+                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"longer than the maximum model length of {max_prompt_len}. "
+                f"{suggestion}")
            # TODO: Find out how many placeholder tokens are there so we can
            # check that chunked prefill does not truncate them

--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -156,7 +156,8 @@ class Metrics:
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
+                2560.0
            ])
        self.histogram_time_per_output_token = self._histogram_cls(
            name="vllm:time_per_output_token_seconds",
@@ -164,14 +165,14 @@ class Metrics:
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
            ])
        # Request stats
        #   Latency
        request_latency_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
-            40.0, 50.0, 60.0
+            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
        ]
        self.histogram_e2e_time_request = self._histogram_cls(
            name="vllm:e2e_request_latency_seconds",

--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
            externally (before the next schedule() call)
        """
        # Sequences can be in RUNNING or FINISHED_ABORTED state
-        # once scheduled, as a sequence is moved to FINSIHED_ABORTED
+        # once scheduled, as a sequence is moved to FINISHED_ABORTED
        # if a client disconnects from the api server.
        seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
        if seqs is None:

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -35,7 +35,7 @@ from typing_extensions import Required, TypeAlias, TypedDict
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
@@ -452,8 +452,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        self._model_config = model_config
        self._tokenizer = tokenizer
-        self._allowed_items = (model_config.multimodal_config.limit_per_prompt
-                               if model_config.multimodal_config else {})
        self._items_by_modality = defaultdict[str, list[_T]](list)
@@ -465,6 +463,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
    def allowed_local_media_path(self):
        return self._model_config.allowed_local_media_path
+    @property
+    def mm_registry(self):
+        return MULTIMODAL_REGISTRY
    @staticmethod
    @cache
    def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
@@ -487,8 +489,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
            if model_type in ("minicpmo", "minicpmv"):
                return "(<image>./</image>)"
-            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral",
+            if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
-                              "mistral3"):
+                              "pixtral", "mistral3"):
                # These models do not use image tokens in the prompt
                return None
            if model_type == "qwen":
@@ -498,7 +500,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                                              hf_config.image_token_index)
            if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
                              "internvl_chat", "skywork_chat", "NVLM_D",
-                              "h2ovl_chat"):
+                              "h2ovl_chat", "idefics3", "smolvlm"):
                return "<image>"
            if model_type in ("mllama", "llama4"):
                return "<|image|>"
@@ -506,8 +508,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
                return "<|vision_start|><|image_pad|><|vision_end|>"
            if model_type == "molmo":
                return ""
-            if model_type == "idefics3":
-                return "<image>"
            if model_type == "aria":
                return "<|fim_prefix|><|img|><|fim_suffix|>"
            if model_type == "gemma3":
@@ -542,12 +542,29 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        Add a multi-modal item to the current prompt and returns the
        placeholder string to use, if any.
        """
-        allowed_count = self._allowed_items.get(modality, 1)
+        mm_registry = self.mm_registry
+        model_config = self.model_config
+        input_modality = modality.replace("_embeds", "")
+        if mm_registry.has_processor(model_config):
+            mm_processor = mm_registry.create_processor(model_config)
+            allowed_counts = mm_processor.info.get_allowed_mm_limits()
+            allowed_count = allowed_counts.get(input_modality, 0)
+        else:
+            mm_config = model_config.multimodal_config
+            if mm_config is None:
+                msg = "This model does not support multi-modal inputs"
+                raise ValueError(msg)
+            allowed_count = mm_config.get_limit_per_prompt(input_modality)
        current_count = len(self._items_by_modality[modality]) + 1
        if current_count > allowed_count:
            raise ValueError(
                f"At most {allowed_count} {modality}(s) may be provided in "
-                "one request.")
+                "one request. You can set `--limit-mm-per-prompt` to "
+                "increase this limit if the model supports it.")
        self._items_by_modality[modality].append(item)
@@ -874,19 +891,19 @@ MM_PARSER_MAP: dict[
    Callable[[ChatCompletionContentPartParam], _ContentPart],
 ] = {
    "text":
-    lambda part: _TextParser(part).get("text", ""),
+    lambda part: _TextParser(part).get("text", None),
    "image_url":
-    lambda part: _ImageParser(part).get("image_url", {}).get("url", ""),
+    lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
    "image_embeds":
-    lambda part: _ImageEmbedsParser(part).get("image_embeds", {}),
+    lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
    "audio_url":
-    lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
+    lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
    "input_audio":
-    lambda part: _InputAudioParser(part).get("input_audio", {}),
+    lambda part: _InputAudioParser(part).get("input_audio", None),
    "refusal":
-    lambda part: _RefusalParser(part).get("refusal", ""),
+    lambda part: _RefusalParser(part).get("refusal", None),
    "video_url":
-    lambda part: _VideoParser(part).get("video_url", {}).get("url", ""),
+    lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
 }
@@ -1005,11 +1022,11 @@ def _parse_chat_message_content_part(
    part_type, content = _parse_chat_message_content_mm_part(part)
    # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
-    # content is empty, log a warning and skip
+    # content is None, log a warning and skip
-    if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
+    if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
        logger.warning(
-            "Skipping multimodal part (type: '%s') "
+            "Skipping multimodal part '%s' (type: '%s') "
-            "with empty / unparsable content.", part_type)
+            "with empty / unparsable content.", part, part_type)
        return None
    if part_type in ("text", "refusal"):
@@ -1195,8 +1212,15 @@ def apply_mistral_chat_template(
        **kwargs,
    )
-    return tokenizer.apply_chat_template(
+    try:
-        messages=messages,
+        return tokenizer.apply_chat_template(
-        tools=tools,
+            messages=messages,
-        **kwargs,
+            tools=tools,
-    )
+            **kwargs,
+        )
+    # mistral-common uses assert statements to stop processing of input
+    # if input does not comply with the expected format.
+    # We convert those assertion errors to ValueErrors so they can be
+    # properly caught in the preprocessing_input step
+    except AssertionError as e:
+        raise ValueError from e
--- a/vllm/entrypoints/cli/benchmark/base.py
+++ b/vllm/entrypoints/cli/benchmark/base.py
@@ -32,6 +32,7 @@ class BenchmarkSubcommandBase(CLISubcommand):
        parser = subparsers.add_parser(
            self.name,
            help=self.help,
+            description=self.help,
            usage=f"vllm bench {self.name} [options]")
        self.add_cli_args(parser)
        return parser