Commit 9c4ecf15 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-ori

parents bfc2d6f7 dc1b4a6f
...@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, ...@@ -14,6 +14,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.paged_attn import (PagedAttention, from vllm.attention.ops.paged_attn import (PagedAttention,
PagedAttentionMetadata) PagedAttentionMetadata)
from vllm.logger import init_logger
logger = init_logger(__name__)
_PARTITION_SIZE = 512 _PARTITION_SIZE = 512
...@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): ...@@ -119,7 +122,12 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if use_irope:
logger.warning_once(
"Using irope in Ipex is not supported yet, it will fall"
" back to global attention for long context.")
if blocksparse_params is not None: if blocksparse_params is not None:
raise ValueError( raise ValueError(
"IPEX backend does not support block-sparse attention.") "IPEX backend does not support block-sparse attention.")
......
...@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention( ...@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
return spda_o @ W_O return spda_o @ W_O
NOTE: in the actual code, NOTE: in the actual code,
`kv_b_proj` is [W_UK; W_UV] concatnated per head `kv_b_proj` is [W_UK; W_UV] concatenated per head
`q_b_proj` is [W_UQ; W_QR] concatnated per head `q_b_proj` is [W_UQ; W_QR] concatenated per head
`out_proj` is W_O `out_proj` is W_O
...@@ -205,6 +205,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, ...@@ -205,6 +205,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
compute_slot_mapping_start_idx, compute_slot_mapping_start_idx,
is_block_tables_empty) is_block_tables_empty)
from vllm.attention.ops.merge_attn_states import merge_attn_states
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
LinearBase, RowParallelLinear, LinearBase, RowParallelLinear,
UnquantizedLinearMethod) UnquantizedLinearMethod)
...@@ -218,9 +219,7 @@ from vllm.vllm_flash_attn.fa_utils import get_flash_attn_version ...@@ -218,9 +219,7 @@ from vllm.vllm_flash_attn.fa_utils import get_flash_attn_version
if HAS_TRITON: if HAS_TRITON:
from vllm.attention.ops.triton_flash_attention import triton_attention from vllm.attention.ops.triton_flash_attention import triton_attention
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
else: else:
merge_attn_states = None
triton_attention = None triton_attention = None
try: try:
...@@ -668,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata): ...@@ -668,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
assert num_seqs > num_queries assert num_seqs > num_queries
if turn_prefills_into_decodes: if turn_prefills_into_decodes:
# When Mutli-Step is enabled with Chunked-Prefill, prefills and # When Multi-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the # decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that # prefills turn into decodes. This update reflects that
# conversion. # conversion.
......
...@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, ...@@ -11,6 +11,9 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType, AttentionMetadata, AttentionType,
is_quantized_kv_cache) is_quantized_kv_cache)
from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.backends.utils import CommonAttentionState
from vllm.logger import init_logger
logger = init_logger(__name__)
class PallasAttentionBackend(AttentionBackend): class PallasAttentionBackend(AttentionBackend):
...@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl): ...@@ -105,7 +108,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if use_irope:
logger.warning_once(
"Using irope in Pallas is not supported yet, it will fall back "
"to global attention for long context.")
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
......
...@@ -462,11 +462,19 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -462,11 +462,19 @@ class ROCmFlashAttentionImpl(AttentionImpl):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if use_irope:
logger.warning_once(
"Using irope in ROCm Flash Attention is not supported yet, it "
"will fail back to global attention for long context.")
if blocksparse_params is not None: if blocksparse_params is not None:
raise ValueError( raise ValueError(
"ROCmFlashAttention does not support blocksparse attention.") "ROCmFlashAttention does not support blocksparse attention.")
if use_irope:
logger.warning(
"Using irope in V0 is not supported yet, it will fall back "
"to global attention for long context.")
if logits_soft_cap is None: if logits_soft_cap is None:
# In flash-attn, setting logits_soft_cap as 0 means no soft cap. # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
self.logits_soft_cap = 0.0 self.logits_soft_cap = 0.0
......
...@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): ...@@ -404,6 +404,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if blocksparse_params is not None: if blocksparse_params is not None:
raise ValueError( raise ValueError(
...@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): ...@@ -411,6 +412,10 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
if logits_soft_cap is not None: if logits_soft_cap is not None:
logger.warning_once("Torch SPDA does not support logits soft cap. " logger.warning_once("Torch SPDA does not support logits soft cap. "
"Outputs may be slightly off.") "Outputs may be slightly off.")
if use_irope:
logger.warning_once(
"Using irope in Torch SPDA is not supported yet, it will fall"
" back to global attention for long context.")
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
......
...@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): ...@@ -389,6 +389,7 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
blocksparse_params: Optional[Dict[str, Any]] = None, blocksparse_params: Optional[Dict[str, Any]] = None,
logits_soft_cap: Optional[float] = None, logits_soft_cap: Optional[float] = None,
attn_type: str = AttentionType.DECODER, attn_type: str = AttentionType.DECODER,
use_irope: bool = False,
) -> None: ) -> None:
if blocksparse_params is not None: if blocksparse_params is not None:
raise ValueError( raise ValueError(
...@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): ...@@ -396,6 +397,10 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
if logits_soft_cap is not None: if logits_soft_cap is not None:
logger.warning_once("XFormers does not support logits soft cap. " logger.warning_once("XFormers does not support logits soft cap. "
"Outputs may be slightly off.") "Outputs may be slightly off.")
if use_irope:
logger.warning_once(
"Using irope in XFormers is not supported yet, it will fall"
" back to global attention for long context.")
self.num_heads = num_heads self.num_heads = num_heads
self.head_size = head_size self.head_size = head_size
self.scale = float(scale) self.scale = float(scale)
...@@ -409,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): ...@@ -409,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
assert self.num_heads % self.num_kv_heads == 0 assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.num_queries_per_kv = self.num_heads // self.num_kv_heads
suppored_head_sizes = PagedAttention.get_supported_head_sizes() supported_head_sizes = PagedAttention.get_supported_head_sizes()
if head_size not in suppored_head_sizes: if head_size not in supported_head_sizes:
raise ValueError( raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. " f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {suppored_head_sizes}.") f"Supported head sizes are: {supported_head_sizes}.")
self.attn_type = attn_type self.attn_type = attn_type
......
# SPDX-License-Identifier: Apache-2.0
from typing import Optional
import torch
from vllm.platforms import current_platform
def merge_attn_states(
    output: torch.Tensor,
    prefix_output: torch.Tensor,
    prefix_lse: torch.Tensor,
    suffix_output: torch.Tensor,
    suffix_lse: torch.Tensor,
    output_lse: Optional[torch.Tensor] = None,
) -> None:
    """Dispatch the merge of prefix/suffix attention states to a kernel.

    The custom CUDA op from ``vllm._custom_ops`` is chosen when the current
    platform is CUDA and ``output`` passes the dtype and head-size checks
    below; otherwise the Triton implementation from
    ``vllm.attention.ops.triton_merge_attn_states`` is used. All arguments
    are forwarded to the selected kernel unchanged, and whatever that kernel
    returns is returned to the caller.
    """

    def _cuda_kernel_applicable(out: torch.Tensor) -> bool:
        # NOTE(DefTruth): The custom merge_attn_states CUDA kernel does not
        # support FP8 dtypes yet, so those fall back to the Triton kernel.
        if out.dtype not in (torch.float32, torch.half, torch.bfloat16):
            return False
        # NOTE(DefTruth): The custom CUDA kernel loads/stores 128 bits
        # (16 bytes) per memory access within a thread, so the head size
        # must be a multiple of the pack size
        # (float32 -> 4, half/bfloat16 -> 8).
        pack_size = 4 if out.dtype == torch.float32 else 8
        head_size = out.shape[2]  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        return head_size % pack_size == 0

    # The platform check comes first so that `output` is only inspected
    # when the CUDA path is a candidate at all. Kernels are imported lazily
    # so only the selected backend module is loaded.
    if current_platform.is_cuda() and _cuda_kernel_applicable(output):
        from vllm._custom_ops import merge_attn_states as kernel
    else:
        from vllm.attention.ops.triton_merge_attn_states import (
            merge_attn_states as kernel)

    return kernel(output, prefix_output, prefix_lse, suffix_output,
                  suffix_lse, output_lse)
...@@ -446,7 +446,7 @@ def flash_paged_attention( ...@@ -446,7 +446,7 @@ def flash_paged_attention(
IO tensor dtypes: IO tensor dtypes:
- This kernel assumes all IO tensors have the same dtype except for - This kernel assumes all IO tensors have the same dtype except for
block_tables (int32) and mask (int32) block_tables (int32) and mask (int32)
- If mixed_percision is True, then all Tensor Engine operation will be - If mixed_precision is True, then all Tensor Engine operation will be
performed in bfloat16 and accumulation will be performed in float32. performed in bfloat16 and accumulation will be performed in float32.
Otherwise the intermediates will be in the same type as the inputs. Otherwise the intermediates will be in the same type as the inputs.
......
...@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. " help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ") "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
parser.add_argument( parser.add_argument(
"--metric-percentiles", "--metric-percentiles",
type=str, type=str,
default="99", default="99",
help="Comma-seperated list of percentiles for selected metrics. " help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Use \"--percentile-metrics\" to select metrics.", "Use \"--percentile-metrics\" to select metrics.",
) )
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
import contextlib import contextlib
import copy import copy
import hashlib import hashlib
import importlib.metadata
import os import os
from contextlib import ExitStack from contextlib import ExitStack
from typing import Any, Callable, Dict, List, Optional, Tuple from typing import Any, Callable, Dict, List, Optional, Tuple
...@@ -11,9 +10,9 @@ from unittest.mock import patch ...@@ -11,9 +10,9 @@ from unittest.mock import patch
import torch import torch
import torch._inductor.compile_fx import torch._inductor.compile_fx
import torch.fx as fx import torch.fx as fx
from packaging.version import Version
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils import is_torch_equal_or_newer
class CompilerInterface: class CompilerInterface:
...@@ -379,7 +378,7 @@ class InductorAdaptor(CompilerInterface): ...@@ -379,7 +378,7 @@ class InductorAdaptor(CompilerInterface):
manually setting up internal contexts. But we also rely on non-public manually setting up internal contexts. But we also rely on non-public
APIs which might not provide these guarantees. APIs which might not provide these guarantees.
""" """
if Version(importlib.metadata.version('torch')) >= Version("2.6"): if is_torch_equal_or_newer("2.6"):
import torch._dynamo.utils import torch._dynamo.utils
return torch._dynamo.utils.get_metrics_context() return torch._dynamo.utils.get_metrics_context()
else: else:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import hashlib import hashlib
import importlib.metadata
import inspect import inspect
import json import json
import types import types
from typing import Any, Callable, Dict, Optional, Union from typing import Any, Callable, Dict, Optional, Union
import torch import torch
from packaging.version import Version
from torch import fx from torch import fx
if Version(importlib.metadata.version('torch')) >= Version("2.6"): from vllm.utils import is_torch_equal_or_newer
if is_torch_equal_or_newer("2.6"):
from torch._inductor.custom_graph_pass import CustomGraphPass from torch._inductor.custom_graph_pass import CustomGraphPass
else: else:
# CustomGraphPass is not present in 2.5 or lower, import our version # CustomGraphPass is not present in 2.5 or lower, import our version
......
...@@ -4,21 +4,22 @@ import ast ...@@ -4,21 +4,22 @@ import ast
import copy import copy
import enum import enum
import hashlib import hashlib
import importlib.metadata import inspect
import json import json
import sys import sys
import textwrap
import warnings import warnings
from collections import Counter from collections import Counter
from collections.abc import Mapping from collections.abc import Mapping
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass, field, replace from dataclasses import (MISSING, dataclass, field, fields, is_dataclass,
replace)
from importlib.util import find_spec from importlib.util import find_spec
from pathlib import Path from pathlib import Path
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal, from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Final, Literal,
Optional, Protocol, Union) Optional, Protocol, TypeVar, Union)
import torch import torch
from packaging.version import Version
from pydantic import BaseModel, Field, PrivateAttr from pydantic import BaseModel, Field, PrivateAttr
from torch.distributed import ProcessGroup, ReduceOp from torch.distributed import ProcessGroup, ReduceOp
from transformers import PretrainedConfig from transformers import PretrainedConfig
...@@ -40,10 +41,11 @@ from vllm.transformers_utils.config import ( ...@@ -40,10 +41,11 @@ from vllm.transformers_utils.config import (
from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, get_open_port, random_uuid, get_cpu_memory, get_open_port, is_torch_equal_or_newer,
resolve_obj_by_qualname) random_uuid, resolve_obj_by_qualname)
if TYPE_CHECKING: if TYPE_CHECKING:
from _typeshed import DataclassInstance
from ray.util.placement_group import PlacementGroup from ray.util.placement_group import PlacementGroup
from vllm.executor.executor_base import ExecutorBase from vllm.executor.executor_base import ExecutorBase
...@@ -52,8 +54,11 @@ if TYPE_CHECKING: ...@@ -52,8 +54,11 @@ if TYPE_CHECKING:
from vllm.model_executor.model_loader.loader import BaseModelLoader from vllm.model_executor.model_loader.loader import BaseModelLoader
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup) BaseTokenizerGroup)
Config = TypeVar("Config", bound=DataclassInstance)
else: else:
QuantizationConfig = None QuantizationConfig = None
Config = TypeVar("Config")
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -106,6 +111,77 @@ class ModelImpl(str, enum.Enum): ...@@ -106,6 +111,77 @@ class ModelImpl(str, enum.Enum):
TRANSFORMERS = "transformers" TRANSFORMERS = "transformers"
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
    """
    Map attribute names to the docstrings written directly after their
    assignments in the body of a class.

    The class source is fetched with `inspect.getsource`, dedented and parsed
    with `ast`. A string-constant expression statement that immediately
    follows an assignment (plain or annotated) is treated as that
    attribute's docstring and cleaned with `inspect.cleandoc`.

    Raises `TypeError` if the first parsed node is not a class definition.

    https://davidism.com/mit-license/
    """
    class_def = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0]

    if not isinstance(class_def, ast.ClassDef):
        raise TypeError("Given object was not a class.")

    statements = class_def.body
    docs = {}

    # Walk adjacent statement pairs. zip() over the list and its one-shifted
    # slice stands in for itertools.pairwise, which needs Python 3.10+.
    for stmt, following in zip(statements, statements[1:]):
        is_assignment = isinstance(stmt, (ast.Assign, ast.AnnAssign))
        is_string_expr = (isinstance(following, ast.Expr)
                          and isinstance(following.value, ast.Constant)
                          and isinstance(following.value.value, str))
        if not (is_assignment and is_string_expr):
            continue

        text = inspect.cleandoc(following.value.value)

        # A plain assignment may have several targets (a = b = v); an
        # annotated assignment always has exactly one.
        if isinstance(stmt, ast.Assign):
            targets = stmt.targets
        else:
            targets = [stmt.target]

        # Only plain names count; attribute or subscript targets are ignored.
        docs.update({
            target.id: text
            for target in targets if isinstance(target, ast.Name)
        })

    return docs
def config(cls: type[Config]) -> type[Config]:
    """
    A decorator that ensures all fields in a dataclass have default values
    and that each field has a docstring.

    `dataclasses.fields` reports fields inherited from base dataclasses as
    well as those declared on `cls`, so docstrings are collected from `cls`
    and from every dataclass in its MRO. A field documented in a parent
    class is therefore accepted on a subclass.

    Args:
        cls: The dataclass to validate.

    Returns:
        `cls`, unchanged.

    Raises:
        TypeError: If `cls` is not a dataclass.
        ValueError: If an `init` field has neither a default nor a default
            factory, or if any field has no docstring.
    """
    if not is_dataclass(cls):
        raise TypeError("The decorated class must be a dataclass.")

    attr_docs: dict[str, str] = {}
    # Visit bases from least to most derived so that a docstring on a more
    # derived class overrides one inherited from a parent.
    for base in reversed(cls.__mro__[1:]):
        if not is_dataclass(base):
            continue
        try:
            attr_docs.update(get_attr_docs(base))
        except (OSError, TypeError):
            # Source for this base could not be read or parsed as a class;
            # any of its fields left undocumented are reported below.
            continue
    # The decorated class itself is processed last and must be readable.
    attr_docs.update(get_attr_docs(cls))

    for f in fields(cls):
        if f.init and f.default is MISSING and f.default_factory is MISSING:
            raise ValueError(
                f"Field '{f.name}' in {cls.__name__} must have a default value."
            )

        if f.name not in attr_docs:
            raise ValueError(
                f"Field '{f.name}' in {cls.__name__} must have a docstring.")

    return cls
class ModelConfig: class ModelConfig:
"""Configuration for the model. """Configuration for the model.
...@@ -173,6 +249,9 @@ class ModelConfig: ...@@ -173,6 +249,9 @@ class ModelConfig:
Defaults to True. Defaults to True.
config_format: The config format which shall be loaded. config_format: The config format which shall be loaded.
Defaults to 'auto' which defaults to 'hf'. Defaults to 'auto' which defaults to 'hf'.
hf_token: The token to use as HTTP bearer authorization for remote
files. If `True`, will use the token generated when running
`huggingface-cli login` (stored in `~/.huggingface`).
hf_overrides: If a dictionary, contains arguments to be forwarded to the hf_overrides: If a dictionary, contains arguments to be forwarded to the
HuggingFace config. If a callable, it is called to update the HuggingFace config. If a callable, it is called to update the
HuggingFace config. HuggingFace config.
...@@ -256,6 +335,7 @@ class ModelConfig: ...@@ -256,6 +335,7 @@ class ModelConfig:
limit_mm_per_prompt: Optional[Mapping[str, int]] = None, limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
use_async_output_proc: bool = True, use_async_output_proc: bool = True,
config_format: ConfigFormat = ConfigFormat.AUTO, config_format: ConfigFormat = ConfigFormat.AUTO,
hf_token: Optional[Union[bool, str]] = None,
hf_overrides: Optional[HfOverrides] = None, hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None,
disable_mm_preprocessor_cache: bool = False, disable_mm_preprocessor_cache: bool = False,
...@@ -358,7 +438,7 @@ class ModelConfig: ...@@ -358,7 +438,7 @@ class ModelConfig:
"attention_chunk_size", None) "attention_chunk_size", None)
self.encoder_config = self._get_encoder_config() self.encoder_config = self._get_encoder_config()
self.hf_image_processor_config = get_hf_image_processor_config( self.hf_image_processor_config = get_hf_image_processor_config(
self.model, revision) self.model, hf_token=hf_token, revision=revision)
self.dtype = _get_and_verify_dtype(self.hf_config, dtype) self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.use_async_output_proc = use_async_output_proc self.use_async_output_proc = use_async_output_proc
self.mm_processor_kwargs = mm_processor_kwargs self.mm_processor_kwargs = mm_processor_kwargs
...@@ -503,6 +583,15 @@ class ModelConfig: ...@@ -503,6 +583,15 @@ class ModelConfig:
if getattr(user_config, k) is None: if getattr(user_config, k) is None:
setattr(user_config, k, v) setattr(user_config, k, v)
if self.is_matryoshka:
if user_config.normalize is None:
user_config.normalize = True
elif not user_config.normalize:
raise ValueError(
"`normalize` must be enabled (set to True) "
"for models that are compatible with "
"Matryoshka Representation.")
return user_config return user_config
return None return None
...@@ -1126,6 +1215,11 @@ class ModelConfig: ...@@ -1126,6 +1215,11 @@ class ModelConfig:
architectures = getattr(self.hf_config, "architectures", []) architectures = getattr(self.hf_config, "architectures", [])
return ModelRegistry.is_v1_compatible(architectures) return ModelRegistry.is_v1_compatible(architectures)
@property
def is_matryoshka(self) -> bool:
return (hasattr(self.hf_config, "matryoshka_dimensions")
or getattr(self.hf_config, "is_matryoshka", False))
class CacheConfig: class CacheConfig:
"""Configuration for the KV cache. """Configuration for the KV cache.
...@@ -1350,44 +1444,47 @@ class LoadFormat(str, enum.Enum): ...@@ -1350,44 +1444,47 @@ class LoadFormat(str, enum.Enum):
FASTSAFETENSORS = "fastsafetensors" FASTSAFETENSORS = "fastsafetensors"
@config
@dataclass @dataclass
class LoadConfig: class LoadConfig:
""" """Configuration for loading the model weights."""
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface. load_format: Union[str, LoadFormat,
load_format: The format of the model weights to load: "BaseModelLoader"] = LoadFormat.AUTO.value
"auto" will try to load the weights in the safetensors format and """The format of the model weights to load:\n
fall back to the pytorch bin format if safetensors format is - "auto" will try to load the weights in the safetensors format and fall
not available. back to the pytorch bin format if safetensors format is not available.\n
"pt" will load the weights in the pytorch bin format. - "pt" will load the weights in the pytorch bin format.\n
"safetensors" will load the weights in the safetensors format. - "safetensors" will load the weights in the safetensors format.\n
"npcache" will load the weights in pytorch format and store - "npcache" will load the weights in pytorch format and store a numpy cache
a numpy cache to speed up the loading. to speed up the loading.\n
"dummy" will initialize the weights with random values, which is - "dummy" will initialize the weights with random values, which is mainly
mainly for profiling. for profiling.\n
"tensorizer" will use CoreWeave's tensorizer library for - "tensorizer" will use CoreWeave's tensorizer library for fast weight
fast weight loading. loading. See the Tensorize vLLM Model script in the Examples section for
"bitsandbytes" will load nf4 type weights. more information.\n
"sharded_state" will load weights from pre-sharded checkpoint files, - "runai_streamer" will load the Safetensors weights using Run:ai Model
supporting efficient loading of tensor-parallel models. Streamer.\n
"gguf" will load weights from GGUF format files. - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
"mistral" will load weights from consolidated safetensors files used - "sharded_state" will load weights from pre-sharded checkpoint files,
by Mistral models. supporting efficient loading of tensor-parallel models.\n
"runai_streamer" will load weights from RunAI streamer format files. - "gguf" will load weights from GGUF format files (details specified in
model_loader_extra_config: The extra config for the model loader. https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
ignore_patterns: The list of patterns to ignore when loading the model. - "mistral" will load weights from consolidated safetensors files used by
Default to "original/**/*" to avoid repeated loading of llama's Mistral models."""
checkpoints.
use_tqdm_on_load: Whether to enable tqdm for showing progress bar during
loading. Default to True
"""
load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
download_dir: Optional[str] = None download_dir: Optional[str] = None
model_loader_extra_config: Optional[Union[str, dict]] = field( """Directory to download and load the weights, default to the default
default_factory=dict) cache directory of Hugging Face."""
model_loader_extra_config: Optional[Union[str, dict]] = None
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format. This should be a JSON string that
will be parsed into a dictionary."""
ignore_patterns: Optional[Union[list[str], str]] = None ignore_patterns: Optional[Union[list[str], str]] = None
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
use_tqdm_on_load: bool = True use_tqdm_on_load: bool = True
"""Whether to enable tqdm for showing progress bar when loading model
weights."""
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
...@@ -1425,61 +1522,77 @@ class LoadConfig: ...@@ -1425,61 +1522,77 @@ class LoadConfig:
self.ignore_patterns = ["original/**/*"] self.ignore_patterns = ["original/**/*"]
@config
@dataclass @dataclass
class ParallelConfig: class ParallelConfig:
"""Configuration for the distributed execution.""" """Configuration for the distributed execution."""
pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1 # Number of tensor parallel groups. """Number of pipeline parallel groups."""
data_parallel_size: int = 1 # Number of data parallel groups. tensor_parallel_size: int = 1
data_parallel_rank: int = 0 # Rank of the data parallel group. """Number of tensor parallel groups."""
# Local rank of the data parallel group, defaults to global rank. data_parallel_size: int = 1
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
data_parallel_rank: int = 0
"""Rank of the data parallel group."""
data_parallel_rank_local: Optional[int] = None data_parallel_rank_local: Optional[int] = None
# IP of the data parallel master. """Local rank of the data parallel group, defaults to global rank."""
data_parallel_master_ip: str = "127.0.0.1" data_parallel_master_ip: str = "127.0.0.1"
data_parallel_master_port: int = 29500 # Port of the data parallel master. """IP of the data parallel master."""
enable_expert_parallel: bool = False # Use EP instead of TP for MoE layers. data_parallel_master_port: int = 29500
"""Port of the data parallel master."""
enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
# Maximum number of multiple batches
# when load model sequentially. To avoid RAM OOM when using tensor
# parallel and large models.
max_parallel_loading_workers: Optional[int] = None max_parallel_loading_workers: Optional[int] = None
"""Maximum number of parallal loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models."""
# Disable the custom all-reduce kernel and fall back to NCCL.
disable_custom_all_reduce: bool = False disable_custom_all_reduce: bool = False
"""Disable the custom all-reduce kernel and fall back to NCCL."""
# Config for the tokenizer pool. If None, will use synchronous tokenization.
tokenizer_pool_config: Optional[TokenizerPoolConfig] = None tokenizer_pool_config: Optional[TokenizerPoolConfig] = None
"""Config for the tokenizer pool. If None, will use synchronous
tokenization."""
# Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.
ray_workers_use_nsight: bool = False ray_workers_use_nsight: bool = False
"""Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
# ray distributed model workers placement group.
placement_group: Optional["PlacementGroup"] = None placement_group: Optional["PlacementGroup"] = None
"""ray distributed model workers placement group."""
# Backend to use for distributed model
# workers, either "ray" or "mp" (multiprocessing). If the product
# of pipeline_parallel_size and tensor_parallel_size is less than
# or equal to the number of GPUs available, "mp" will be used to
# keep processing on a single host. Otherwise, this will default
# to "ray" if Ray is installed and fail otherwise. Note that tpu
# and hpu only support Ray for distributed inference.
distributed_executor_backend: Optional[Union[str, distributed_executor_backend: Optional[Union[str,
type["ExecutorBase"]]] = None type["ExecutorBase"]]] = None
"""Backend to use for distributed model
workers, either "ray" or "mp" (multiprocessing). If the product
of pipeline_parallel_size and tensor_parallel_size is less than
or equal to the number of GPUs available, "mp" will be used to
keep processing on a single host. Otherwise, this will default
to "ray" if Ray is installed and fail otherwise. Note that tpu
and hpu only support Ray for distributed inference."""
# the full name of the worker class to use. If "auto", the worker class
# will be determined based on the platform.
worker_cls: str = "auto" worker_cls: str = "auto"
"""The full name of the worker class to use. If "auto", the worker class
will be determined based on the platform."""
sd_worker_cls: str = "auto" sd_worker_cls: str = "auto"
"""The full name of the worker class to use for speculative decofing.
If "auto", the worker class will be determined based on the platform."""
worker_extension_cls: str = "" worker_extension_cls: str = ""
"""The full name of the worker extension class to use. The worker extension
class is dynamically inherited by the worker class. This is used to inject
new attributes and methods to the worker class for use in collective_rpc
calls."""
# world_size is TPxPP, it affects the number of workers we create.
world_size: int = field(init=False) world_size: int = field(init=False)
# world_size_across_dp is TPxPPxDP, it is the size of the world """world_size is TPxPP, it affects the number of workers we create."""
# including data parallelism.
world_size_across_dp: int = field(init=False) world_size_across_dp: int = field(init=False)
"""world_size_across_dp is TPxPPxDP, it is the size of the world
including data parallelism."""
rank: int = 0 rank: int = 0
"""Global rank in distributed setup."""
def get_next_dp_init_port(self) -> int: def get_next_dp_init_port(self) -> int:
""" """
...@@ -1717,6 +1830,14 @@ class SchedulerConfig: ...@@ -1717,6 +1830,14 @@ class SchedulerConfig:
chunked_prefill_enabled: bool = field(init=False) chunked_prefill_enabled: bool = field(init=False)
# If set to true and chunked prefill is enabled, we do not want to
# partially schedule a multimodal item. Only used in V1.
# This ensures that if a request has a mixed prompt
# (like text tokens TTTT followed by image tokens IIIIIIIIII) where only
# some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),
# it will be scheduled as TTTT in one step and IIIIIIIIII in the next.
disable_chunked_mm_input: bool = False
# scheduler class or path. "vllm.core.scheduler.Scheduler" (default) # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
# or "mod.custom_class". # or "mod.custom_class".
scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler" scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
...@@ -2468,6 +2589,11 @@ class LoRAConfig: ...@@ -2468,6 +2589,11 @@ class LoRAConfig:
logger.warning("LoRA with chunked prefill is still experimental " logger.warning("LoRA with chunked prefill is still experimental "
"and may be unstable.") "and may be unstable.")
def verify_lora_support(self):
    """Reject long-LoRA settings when `envs.VLLM_USE_V1` is enabled.

    Raises:
        ValueError: If `long_lora_scaling_factors` is set while
            `envs.VLLM_USE_V1` is truthy.
    """
    # Nothing to validate unless long-LoRA scaling factors were configured.
    if self.long_lora_scaling_factors is None:
        return
    if envs.VLLM_USE_V1:
        raise ValueError(
            "V1 LoRA does not support long LoRA, please use V0.")
@dataclass @dataclass
class PromptAdapterConfig: class PromptAdapterConfig:
...@@ -2541,14 +2667,20 @@ class MultiModalConfig: ...@@ -2541,14 +2667,20 @@ class MultiModalConfig:
usedforsecurity=False).hexdigest() usedforsecurity=False).hexdigest()
return hash_str return hash_str
def get_default_limit_per_prompt(self) -> int:
    """
    Return the per-prompt item limit used for any modality the user
    did not configure: 999 when `envs.VLLM_USE_V1` is truthy, else 1.
    """
    if envs.VLLM_USE_V1:
        return 999
    return 1
def get_limit_per_prompt(self, modality: str) -> int: def get_limit_per_prompt(self, modality: str) -> int:
""" """
Get the maximum number of input items allowed per prompt Get the maximum number of input items allowed per prompt
for the given modality. for the given modality.
If not set by the user, this defaults to `1`.
""" """
return self.limit_per_prompt.get(modality, 1) default = self.get_default_limit_per_prompt()
return self.limit_per_prompt.get(modality, default)
# TODO: Add configs to init vision tower or not. # TODO: Add configs to init vision tower or not.
...@@ -2871,7 +3003,7 @@ class DecodingConfig: ...@@ -2871,7 +3003,7 @@ class DecodingConfig:
# Which guided decoding algo to use. # Which guided decoding algo to use.
# 'outlines' / 'lm-format-enforcer' / 'xgrammar' # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
guided_decoding_backend: str = 'xgrammar' guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
reasoning_backend: Optional[str] = None reasoning_backend: Optional[str] = None
...@@ -2896,7 +3028,7 @@ class DecodingConfig: ...@@ -2896,7 +3028,7 @@ class DecodingConfig:
def __post_init__(self): def __post_init__(self):
v0_valid_guided_backends = [ v0_valid_guided_backends = [
'outlines', 'lm-format-enforcer', 'xgrammar' 'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
] ]
v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto'] v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
...@@ -3268,7 +3400,7 @@ class CompilationConfig(BaseModel): ...@@ -3268,7 +3400,7 @@ class CompilationConfig(BaseModel):
# and it is not yet a priority. RFC here: # and it is not yet a priority. RFC here:
# https://github.com/vllm-project/vllm/issues/14703 # https://github.com/vllm-project/vllm/issues/14703
if Version(importlib.metadata.version('torch')) >= Version("2.6"): if is_torch_equal_or_newer("2.6"):
KEY = 'enable_auto_functionalized_v2' KEY = 'enable_auto_functionalized_v2'
if KEY not in self.inductor_compile_config: if KEY not in self.inductor_compile_config:
self.inductor_compile_config[KEY] = False self.inductor_compile_config[KEY] = False
...@@ -3567,6 +3699,7 @@ class VllmConfig: ...@@ -3567,6 +3699,7 @@ class VllmConfig:
self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_with_model_config(self.model_config)
self.lora_config.verify_with_scheduler_config( self.lora_config.verify_with_scheduler_config(
self.scheduler_config) self.scheduler_config)
self.lora_config.verify_lora_support()
if self.prompt_adapter_config: if self.prompt_adapter_config:
self.prompt_adapter_config.verify_with_model_config( self.prompt_adapter_config.verify_with_model_config(
self.model_config) self.model_config)
...@@ -3769,7 +3902,9 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): ...@@ -3769,7 +3902,9 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
try: try:
_current_vllm_config = vllm_config _current_vllm_config = vllm_config
yield yield
finally: except Exception:
raise
else:
logger.debug("enabled custom ops: %s", logger.debug("enabled custom ops: %s",
vllm_config.compilation_config.enabled_custom_ops) vllm_config.compilation_config.enabled_custom_ops)
logger.debug("disabled custom ops: %s", logger.debug("disabled custom ops: %s",
...@@ -3787,6 +3922,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): ...@@ -3787,6 +3922,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
" does not support it. Please open an issue on GitHub" " does not support it. Please open an issue on GitHub"
" if you want it to be supported.", " if you want it to be supported.",
vllm_config.model_config.model) vllm_config.model_config.model)
finally:
_current_vllm_config = old_vllm_config _current_vllm_config = old_vllm_config
......
...@@ -194,9 +194,11 @@ class GroupCoordinator: ...@@ -194,9 +194,11 @@ class GroupCoordinator:
from vllm.platforms import current_platform from vllm.platforms import current_platform
# TODO: fix it for other platforms
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
self.device = torch.device(f"cuda:{local_rank}") self.device = torch.device(f"cuda:{local_rank}")
elif current_platform.is_out_of_tree():
self.device = torch.device(
f"{current_platform.device_name}:{local_rank}")
else: else:
self.device = torch.device("cpu") self.device = torch.device("cpu")
......
...@@ -102,10 +102,11 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, ...@@ -102,10 +102,11 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int,
if remaining_layers := num_hidden_layers % pp_size: if remaining_layers := num_hidden_layers % pp_size:
for i in range(2, remaining_layers + 2): for i in range(2, remaining_layers + 2):
partitions[-i] += 1 partitions[-i] += 1
logger.info("Hidden layers were unevenly partitioned: %s", logger.info(
",".join(str(p) for p in partitions)) "Hidden layers were unevenly partitioned: [%s]. "
logger.info("This can be manually overridden using the " "This can be manually overridden using the "
"VLLM_PP_LAYER_PARTITION environment variable") "VLLM_PP_LAYER_PARTITION environment variable",
",".join(str(p) for p in partitions))
start_layer = sum(partitions[:pp_rank]) start_layer = sum(partitions[:pp_rank])
end_layer = start_layer + partitions[pp_rank] end_layer = start_layer + partitions[pp_rank]
......
...@@ -3,10 +3,11 @@ ...@@ -3,10 +3,11 @@
import argparse import argparse
import dataclasses import dataclasses
import json import json
import re
import threading import threading
from dataclasses import dataclass from dataclasses import MISSING, dataclass, fields
from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional, from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional,
Tuple, Type, Union, cast, get_args) Tuple, Type, Union, cast, get_args, get_origin)
import torch import torch
...@@ -18,7 +19,7 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, ...@@ -18,7 +19,7 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
ModelConfig, ModelImpl, ObservabilityConfig, ModelConfig, ModelImpl, ObservabilityConfig,
ParallelConfig, PoolerConfig, PromptAdapterConfig, ParallelConfig, PoolerConfig, PromptAdapterConfig,
SchedulerConfig, SpeculativeConfig, TaskOption, SchedulerConfig, SpeculativeConfig, TaskOption,
TokenizerPoolConfig, VllmConfig) TokenizerPoolConfig, VllmConfig, get_attr_docs)
from vllm.executor.executor_base import ExecutorBase from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
...@@ -100,8 +101,8 @@ class EngineArgs: ...@@ -100,8 +101,8 @@ class EngineArgs:
tokenizer_mode: str = 'auto' tokenizer_mode: str = 'auto'
trust_remote_code: bool = False trust_remote_code: bool = False
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
download_dir: Optional[str] = None download_dir: Optional[str] = LoadConfig.download_dir
load_format: str = 'auto' load_format: str = LoadConfig.load_format
config_format: ConfigFormat = ConfigFormat.AUTO config_format: ConfigFormat = ConfigFormat.AUTO
dtype: str = 'auto' dtype: str = 'auto'
kv_cache_dtype: str = 'auto' kv_cache_dtype: str = 'auto'
...@@ -110,14 +111,15 @@ class EngineArgs: ...@@ -110,14 +111,15 @@ class EngineArgs:
# Note: Specifying a custom executor backend by passing a class # Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without # is intended for expert use only. The API may change without
# notice. # notice.
distributed_executor_backend: Optional[Union[str, distributed_executor_backend: Optional[Union[
Type[ExecutorBase]]] = None str, Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
# number of P/D disaggregation (or other disaggregation) workers # number of P/D disaggregation (or other disaggregation) workers
pipeline_parallel_size: int = 1 pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
tensor_parallel_size: int = 1 tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
data_parallel_size: int = 1 data_parallel_size: int = ParallelConfig.data_parallel_size
enable_expert_parallel: bool = False enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
max_parallel_loading_workers: Optional[int] = None max_parallel_loading_workers: Optional[
int] = ParallelConfig.max_parallel_loading_workers
block_size: Optional[int] = None block_size: Optional[int] = None
enable_prefix_caching: Optional[bool] = None enable_prefix_caching: Optional[bool] = None
prefix_caching_hash_algo: str = "builtin" prefix_caching_hash_algo: str = "builtin"
...@@ -138,12 +140,13 @@ class EngineArgs: ...@@ -138,12 +140,13 @@ class EngineArgs:
code_revision: Optional[str] = None code_revision: Optional[str] = None
rope_scaling: Optional[Dict[str, Any]] = None rope_scaling: Optional[Dict[str, Any]] = None
rope_theta: Optional[float] = None rope_theta: Optional[float] = None
hf_token: Optional[Union[bool, str]] = None
hf_overrides: Optional[HfOverrides] = None hf_overrides: Optional[HfOverrides] = None
tokenizer_revision: Optional[str] = None tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None quantization: Optional[str] = None
enforce_eager: Optional[bool] = None enforce_eager: Optional[bool] = None
max_seq_len_to_capture: int = 8192 max_seq_len_to_capture: int = 8192
disable_custom_all_reduce: bool = False disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
tokenizer_pool_size: int = 0 tokenizer_pool_size: int = 0
# Note: Specifying a tokenizer pool by passing a class # Note: Specifying a tokenizer pool by passing a class
# is intended for expert use only. The API may change without # is intended for expert use only. The API may change without
...@@ -168,17 +171,20 @@ class EngineArgs: ...@@ -168,17 +171,20 @@ class EngineArgs:
device: str = 'auto' device: str = 'auto'
num_scheduler_steps: int = 1 num_scheduler_steps: int = 1
multi_step_stream_outputs: bool = True multi_step_stream_outputs: bool = True
ray_workers_use_nsight: bool = False ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
num_gpu_blocks_override: Optional[int] = None num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0 num_lookahead_slots: int = 0
model_loader_extra_config: Optional[dict] = None model_loader_extra_config: Optional[
ignore_patterns: Optional[Union[str, List[str]]] = None dict] = LoadConfig.model_loader_extra_config
ignore_patterns: Optional[Union[str,
List[str]]] = LoadConfig.ignore_patterns
preemption_mode: Optional[str] = None preemption_mode: Optional[str] = None
scheduler_delay_factor: float = 0.0 scheduler_delay_factor: float = 0.0
enable_chunked_prefill: Optional[bool] = None enable_chunked_prefill: Optional[bool] = None
disable_chunked_mm_input: bool = False
guided_decoding_backend: str = 'xgrammar' guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
logits_processor_pattern: Optional[str] = None logits_processor_pattern: Optional[str] = None
speculative_config: Optional[Dict[str, Any]] = None speculative_config: Optional[Dict[str, Any]] = None
...@@ -194,8 +200,8 @@ class EngineArgs: ...@@ -194,8 +200,8 @@ class EngineArgs:
override_neuron_config: Optional[Dict[str, Any]] = None override_neuron_config: Optional[Dict[str, Any]] = None
override_pooler_config: Optional[PoolerConfig] = None override_pooler_config: Optional[PoolerConfig] = None
compilation_config: Optional[CompilationConfig] = None compilation_config: Optional[CompilationConfig] = None
worker_cls: str = "auto" worker_cls: str = ParallelConfig.worker_cls
worker_extension_cls: str = "" worker_extension_cls: str = ParallelConfig.worker_extension_cls
kv_transfer_config: Optional[KVTransferConfig] = None kv_transfer_config: Optional[KVTransferConfig] = None
...@@ -209,7 +215,7 @@ class EngineArgs: ...@@ -209,7 +215,7 @@ class EngineArgs:
additional_config: Optional[Dict[str, Any]] = None additional_config: Optional[Dict[str, Any]] = None
enable_reasoning: Optional[bool] = None enable_reasoning: Optional[bool] = None
reasoning_parser: Optional[str] = None reasoning_parser: Optional[str] = None
use_tqdm_on_load: bool = True use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
def __post_init__(self): def __post_init__(self):
if not self.tokenizer: if not self.tokenizer:
...@@ -229,6 +235,39 @@ class EngineArgs: ...@@ -229,6 +235,39 @@ class EngineArgs:
@staticmethod @staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Shared CLI arguments for vLLM engine.""" """Shared CLI arguments for vLLM engine."""
def is_type_in_union(cls: type[Any], type: type[Any]) -> bool:
    """Return True if `cls` is a `typing.Union` listing `type` as a member."""
    # Anything whose origin is not `Union` (plain classes, `list[int]`, ...)
    # can never contain the requested member.
    if get_origin(cls) is not Union:
        return False
    return type in get_args(cls)
def is_optional(cls: type[Any]) -> bool:
    """Return True if `cls` is a `typing.Union` that includes `NoneType`."""
    none_type = type(None)
    return is_type_in_union(cls, none_type)
def get_kwargs(cls: type[Any]) -> Dict[str, Any]:
    """Build per-field `add_argument` keyword arguments for dataclass `cls`.

    Returns a dict keyed by field name. Every value carries "default" and
    "help"; non-bool fields additionally carry "type".
    """
    # NOTE(review): presumably maps each field name to its attribute
    # docstring; the subscript below fails for a field with no entry.
    cls_docs = get_attr_docs(cls)
    kwargs: Dict[str, Any] = {}
    for dc_field in fields(cls):
        name = dc_field.name
        if (dc_field.default is MISSING
                and dc_field.default_factory is not MISSING):
            # argparse stores `default` verbatim, so pass the value the
            # factory produces rather than the factory callable itself.
            default = dc_field.default_factory()
        else:
            # Fields with neither a default nor a factory
            # (e.g. `field(init=False)`) keep the MISSING sentinel.
            default = dc_field.default
        entry = {"default": default, "help": cls_docs[name]}
        kwargs[name] = entry

        field_type = dc_field.type
        # When using action="store_true"
        # add_argument doesn't accept type
        if field_type is bool:
            continue
        if is_optional(field_type):
            # Handle optional fields
            entry["type"] = nullable_str
        elif is_type_in_union(field_type, str):
            # Handle str in union fields
            entry["type"] = str
        else:
            entry["type"] = field_type
    return kwargs
# Model arguments # Model arguments
parser.add_argument( parser.add_argument(
'--model', '--model',
...@@ -304,38 +343,23 @@ class EngineArgs: ...@@ -304,38 +343,23 @@ class EngineArgs:
"from directories specified by the server file system. " "from directories specified by the server file system. "
"This is a security risk. " "This is a security risk. "
"Should only be enabled in trusted environments.") "Should only be enabled in trusted environments.")
parser.add_argument('--download-dir', # Model loading arguments
type=nullable_str, load_kwargs = get_kwargs(LoadConfig)
default=EngineArgs.download_dir, load_group = parser.add_argument_group(
help='Directory to download and load the weights.') title="LoadConfig",
parser.add_argument( description=LoadConfig.__doc__,
'--load-format', )
type=str, load_group.add_argument('--load-format',
default=EngineArgs.load_format, choices=[f.value for f in LoadFormat],
choices=[f.value for f in LoadFormat], **load_kwargs["load_format"])
help='The format of the model weights to load.\n\n' load_group.add_argument('--download-dir',
'* "auto" will try to load the weights in the safetensors format ' **load_kwargs["download_dir"])
'and fall back to the pytorch bin format if safetensors format ' load_group.add_argument('--model-loader-extra-config',
'is not available.\n' **load_kwargs["model_loader_extra_config"])
'* "pt" will load the weights in the pytorch bin format.\n' load_group.add_argument('--use-tqdm-on-load',
'* "safetensors" will load the weights in the safetensors format.\n' action=argparse.BooleanOptionalAction,
'* "npcache" will load the weights in pytorch format and store ' **load_kwargs["use_tqdm_on_load"])
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples '
'section for more information.\n'
'* "runai_streamer" will load the Safetensors weights using Run:ai'
'Model Streamer.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n'
'* "sharded_state" will load weights from pre-sharded checkpoint '
'files, supporting efficient loading of tensor-parallel models\n'
'* "gguf" will load weights from GGUF format files (details '
'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n'
'* "mistral" will load weights from consolidated safetensors files '
'used by Mistral models.\n')
parser.add_argument( parser.add_argument(
'--config-format', '--config-format',
default=EngineArgs.config_format, default=EngineArgs.config_format,
...@@ -367,20 +391,24 @@ class EngineArgs: ...@@ -367,20 +391,24 @@ class EngineArgs:
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument('--max-model-len', parser.add_argument('--max-model-len',
type=int, type=human_readable_int,
default=EngineArgs.max_model_len, default=EngineArgs.max_model_len,
help='Model context length. If unspecified, will ' help='Model context length. If unspecified, will '
'be automatically derived from the model config.') 'be automatically derived from the model config. '
'Supports k/m/g/K/M/G in human-readable format.\n'
'Examples:\n'
'- 1k → 1000\n'
'- 1K → 1024\n')
parser.add_argument( parser.add_argument(
'--guided-decoding-backend', '--guided-decoding-backend',
type=str, type=str,
default='xgrammar', default=DecodingConfig.guided_decoding_backend,
help='Which engine will be used for guided decoding' help='Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support ' ' (JSON schema / regex etc) by default. Currently support '
'https://github.com/mlc-ai/xgrammar and ' 'https://github.com/mlc-ai/xgrammar and '
'https://github.com/guidance-ai/llguidance.' 'https://github.com/guidance-ai/llguidance.'
'Valid backend values are "xgrammar", "guidance", and "auto". ' 'Valid backend values are "xgrammar", "guidance", and "auto". '
'With "auto", we will make opinionated choices based on request' 'With "auto", we will make opinionated choices based on request '
'contents and what the backend libraries currently support, so ' 'contents and what the backend libraries currently support, so '
'the behavior is subject to change in each release.') 'the behavior is subject to change in each release.')
parser.add_argument( parser.add_argument(
...@@ -404,52 +432,37 @@ class EngineArgs: ...@@ -404,52 +432,37 @@ class EngineArgs:
'* "transformers" will use the Transformers model ' '* "transformers" will use the Transformers model '
'implementation.\n') 'implementation.\n')
# Parallel arguments # Parallel arguments
parser.add_argument( parallel_kwargs = get_kwargs(ParallelConfig)
parallel_group = parser.add_argument_group(
title="ParallelConfig",
description=ParallelConfig.__doc__,
)
parallel_group.add_argument(
'--distributed-executor-backend', '--distributed-executor-backend',
choices=['ray', 'mp', 'uni', 'external_launcher'], choices=['ray', 'mp', 'uni', 'external_launcher'],
default=EngineArgs.distributed_executor_backend, **parallel_kwargs["distributed_executor_backend"])
help='Backend to use for distributed model ' parallel_group.add_argument(
'workers, either "ray" or "mp" (multiprocessing). If the product ' '--pipeline-parallel-size', '-pp',
'of pipeline_parallel_size and tensor_parallel_size is less than ' **parallel_kwargs["pipeline_parallel_size"])
'or equal to the number of GPUs available, "mp" will be used to ' parallel_group.add_argument('--tensor-parallel-size', '-tp',
'keep processing on a single host. Otherwise, this will default ' **parallel_kwargs["tensor_parallel_size"])
'to "ray" if Ray is installed and fail otherwise. Note that tpu ' parallel_group.add_argument('--data-parallel-size', '-dp',
'only supports Ray for distributed inference.') **parallel_kwargs["data_parallel_size"])
parallel_group.add_argument(
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
default=EngineArgs.pipeline_parallel_size,
help='Number of pipeline stages.')
parser.add_argument('--tensor-parallel-size',
'-tp',
type=int,
default=EngineArgs.tensor_parallel_size,
help='Number of tensor parallel replicas.')
parser.add_argument('--data-parallel-size',
'-dp',
type=int,
default=EngineArgs.data_parallel_size,
help='Number of data parallel replicas. '
'MoE layers will be sharded according to the '
'product of the tensor-parallel-size and '
'data-parallel-size.')
parser.add_argument(
'--enable-expert-parallel', '--enable-expert-parallel',
action='store_true', action='store_true',
help='Use expert parallelism instead of tensor parallelism ' **parallel_kwargs["enable_expert_parallel"])
'for MoE layers.') parallel_group.add_argument(
parser.add_argument(
'--max-parallel-loading-workers', '--max-parallel-loading-workers',
type=int, **parallel_kwargs["max_parallel_loading_workers"])
default=EngineArgs.max_parallel_loading_workers, parallel_group.add_argument(
help='Load model sequentially in multiple batches, '
'to avoid RAM OOM when using tensor '
'parallel and large models.')
parser.add_argument(
'--ray-workers-use-nsight', '--ray-workers-use-nsight',
action='store_true', action='store_true',
help='If specified, use nsight to profile Ray workers.') **parallel_kwargs["ray_workers_use_nsight"])
parallel_group.add_argument(
'--disable-custom-all-reduce',
action='store_true',
**parallel_kwargs["disable_custom_all_reduce"])
# KV cache arguments # KV cache arguments
parser.add_argument('--block-size', parser.add_argument('--block-size',
type=int, type=int,
...@@ -602,6 +615,16 @@ class EngineArgs: ...@@ -602,6 +615,16 @@ class EngineArgs:
help='RoPE theta. Use with `rope_scaling`. In ' help='RoPE theta. Use with `rope_scaling`. In '
'some cases, changing the RoPE theta improves the ' 'some cases, changing the RoPE theta improves the '
'performance of the scaled model.') 'performance of the scaled model.')
parser.add_argument(
'--hf-token',
type=str,
nargs='?',
const=True,
default=None,
help='The token to use as HTTP bearer authorization'
' for remote files. If `True`, will use the token '
'generated when running `huggingface-cli login` '
'(stored in `~/.huggingface`).')
parser.add_argument('--hf-overrides', parser.add_argument('--hf-overrides',
type=json.loads, type=json.loads,
default=EngineArgs.hf_overrides, default=EngineArgs.hf_overrides,
...@@ -622,10 +645,6 @@ class EngineArgs: ...@@ -622,10 +645,6 @@ class EngineArgs:
'Additionally for encoder-decoder models, if the ' 'Additionally for encoder-decoder models, if the '
'sequence length of the encoder input is larger ' 'sequence length of the encoder input is larger '
'than this, we fall back to the eager mode.') 'than this, we fall back to the eager mode.')
parser.add_argument('--disable-custom-all-reduce',
action='store_true',
default=EngineArgs.disable_custom_all_reduce,
help='See ParallelConfig.')
parser.add_argument('--tokenizer-pool-size', parser.add_argument('--tokenizer-pool-size',
type=int, type=int,
default=EngineArgs.tokenizer_pool_size, default=EngineArgs.tokenizer_pool_size,
...@@ -652,13 +671,13 @@ class EngineArgs: ...@@ -652,13 +671,13 @@ class EngineArgs:
type=nullable_kvs, type=nullable_kvs,
default=EngineArgs.limit_mm_per_prompt, default=EngineArgs.limit_mm_per_prompt,
# The default value is given in # The default value is given in
# MultiModalConfig.get_limit_per_prompt # MultiModalConfig.get_default_limit_per_prompt
help=('For each multimodal plugin, limit how many ' help=('For each multimodal plugin, limit how many '
'input instances to allow for each prompt. ' 'input instances to allow for each prompt. '
'Expects a comma-separated list of items, ' 'Expects a comma-separated list of items, '
'e.g.: `image=16,video=2` allows a maximum of 16 ' 'e.g.: `image=16,video=2` allows a maximum of 16 '
'images and 2 videos per prompt. Defaults to 1 for ' 'images and 2 videos per prompt. Defaults to '
'each modality.')) '1 (V0) or 999 (V1) for each modality.'))
parser.add_argument( parser.add_argument(
'--mm-processor-kwargs', '--mm-processor-kwargs',
default=None, default=None,
...@@ -746,14 +765,6 @@ class EngineArgs: ...@@ -746,14 +765,6 @@ class EngineArgs:
default=1, default=1,
help=('Maximum number of forward steps per ' help=('Maximum number of forward steps per '
'scheduler call.')) 'scheduler call.'))
parser.add_argument(
'--use-tqdm-on-load',
dest='use_tqdm_on_load',
action=argparse.BooleanOptionalAction,
default=EngineArgs.use_tqdm_on_load,
help='Whether to enable/disable progress bar '
'when loading model weights.',
)
parser.add_argument( parser.add_argument(
'--multi-step-stream-outputs', '--multi-step-stream-outputs',
...@@ -782,15 +793,6 @@ class EngineArgs: ...@@ -782,15 +793,6 @@ class EngineArgs:
default=None, default=None,
help='The configurations for speculative decoding.' help='The configurations for speculative decoding.'
' Should be a JSON string.') ' Should be a JSON string.')
parser.add_argument('--model-loader-extra-config',
type=nullable_str,
default=EngineArgs.model_loader_extra_config,
help='Extra config for model loader. '
'This will be passed to the model loader '
'corresponding to the chosen load_format. '
'This should be a JSON string that will be '
'parsed into a dictionary.')
parser.add_argument( parser.add_argument(
'--ignore-patterns', '--ignore-patterns',
action="append", action="append",
...@@ -1001,6 +1003,20 @@ class EngineArgs: ...@@ -1001,6 +1003,20 @@ class EngineArgs:
"Note that even if this is set to False, cascade attention will be " "Note that even if this is set to False, cascade attention will be "
"only used when the heuristic tells that it's beneficial.") "only used when the heuristic tells that it's beneficial.")
parser.add_argument(
"--disable-chunked-mm-input",
action=StoreBoolean,
default=EngineArgs.disable_chunked_mm_input,
nargs="?",
const="True",
help="Disable multimodal input chunking attention for V1. "
"If set to true and chunked prefill is enabled, we do not want to"
" partially schedule a multimodal item. This ensures that if a "
"request has a mixed prompt (like text tokens TTTT followed by "
"image tokens IIIIIIIIII) where only some image tokens can be "
"scheduled (like TTTTIIIII, leaving IIIII), it will be scheduled "
"as TTTT in one step and IIIIIIIIII in the next.")
return parser return parser
@classmethod @classmethod
...@@ -1038,6 +1054,7 @@ class EngineArgs: ...@@ -1038,6 +1054,7 @@ class EngineArgs:
code_revision=self.code_revision, code_revision=self.code_revision,
rope_scaling=self.rope_scaling, rope_scaling=self.rope_scaling,
rope_theta=self.rope_theta, rope_theta=self.rope_theta,
hf_token=self.hf_token,
hf_overrides=self.hf_overrides, hf_overrides=self.hf_overrides,
tokenizer_revision=self.tokenizer_revision, tokenizer_revision=self.tokenizer_revision,
max_model_len=self.max_model_len, max_model_len=self.max_model_len,
...@@ -1244,6 +1261,7 @@ class EngineArgs: ...@@ -1244,6 +1261,7 @@ class EngineArgs:
num_lookahead_slots=num_lookahead_slots, num_lookahead_slots=num_lookahead_slots,
delay_factor=self.scheduler_delay_factor, delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill, enable_chunked_prefill=self.enable_chunked_prefill,
disable_chunked_mm_input=self.disable_chunked_mm_input,
is_multimodal_model=model_config.is_multimodal_model, is_multimodal_model=model_config.is_multimodal_model,
preemption_mode=self.preemption_mode, preemption_mode=self.preemption_mode,
num_scheduler_steps=self.num_scheduler_steps, num_scheduler_steps=self.num_scheduler_steps,
...@@ -1275,6 +1293,10 @@ class EngineArgs: ...@@ -1275,6 +1293,10 @@ class EngineArgs:
self.model_loader_extra_config[ self.model_loader_extra_config[
"qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
# bitsandbytes pre-quantized models need a specific model loader
if model_config.quantization == "bitsandbytes":
self.quantization = self.load_format = "bitsandbytes"
load_config = self.create_load_config() load_config = self.create_load_config()
prompt_adapter_config = PromptAdapterConfig( prompt_adapter_config = PromptAdapterConfig(
...@@ -1650,12 +1672,14 @@ class EngineArgs: ...@@ -1650,12 +1672,14 @@ class EngineArgs:
UsageContext.LLM_CLASS: 16384, UsageContext.LLM_CLASS: 16384,
UsageContext.OPENAI_API_SERVER: 8192, UsageContext.OPENAI_API_SERVER: 8192,
} }
default_max_num_seqs = 1024
else: else:
# TODO(woosuk): Tune the default values for other hardware. # TODO(woosuk): Tune the default values for other hardware.
default_max_num_batched_tokens = { default_max_num_batched_tokens = {
UsageContext.LLM_CLASS: 8192, UsageContext.LLM_CLASS: 8192,
UsageContext.OPENAI_API_SERVER: 2048, UsageContext.OPENAI_API_SERVER: 2048,
} }
default_max_num_seqs = 256
use_context_value = usage_context.value if usage_context else None use_context_value = usage_context.value if usage_context else None
if (self.max_num_batched_tokens is None if (self.max_num_batched_tokens is None
...@@ -1666,7 +1690,6 @@ class EngineArgs: ...@@ -1666,7 +1690,6 @@ class EngineArgs:
"Setting max_num_batched_tokens to %d for %s usage context.", "Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, use_context_value) self.max_num_batched_tokens, use_context_value)
default_max_num_seqs = 1024
if self.max_num_seqs is None: if self.max_num_seqs is None:
self.max_num_seqs = default_max_num_seqs self.max_num_seqs = default_max_num_seqs
...@@ -1723,6 +1746,47 @@ def _warn_or_fallback(feature_name: str) -> bool: ...@@ -1723,6 +1746,47 @@ def _warn_or_fallback(feature_name: str) -> bool:
return should_exit return should_exit
def human_readable_int(value):
    """Parse human-readable integers like '1k', '2M', etc.

    Lowercase suffixes (k, m, g, t) are decimal multipliers (powers of 1000)
    and may be combined with a decimal number. Uppercase suffixes (K, M, G, T)
    are binary multipliers (powers of 1024) and require a whole number.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '1t' -> 1,000,000,000,000
    - '1T' -> 1,099,511,627,776

    Args:
        value: The string to parse. Surrounding whitespace is ignored.

    Returns:
        The parsed integer. A fractional result produced by a decimal
        multiplier is truncated by ``int()``.

    Raises:
        argparse.ArgumentTypeError: If a decimal number is combined with a
            binary suffix (e.g. '1.5K').
        ValueError: If the string is neither a plain integer nor a number
            followed by one of the supported suffixes.
    """
    value = value.strip()
    match = re.fullmatch(r'(\d+(?:\.\d+)?)([kKmMgGtT])', value)
    if match:
        # Every suffix accepted by the pattern above must have an entry in
        # exactly one of these tables; otherwise a matched value would fall
        # through to the plain-integer path and fail with a confusing error.
        decimal_multiplier = {
            'k': 10**3,
            'm': 10**6,
            'g': 10**9,
            't': 10**12,
        }
        binary_multiplier = {
            'K': 2**10,
            'M': 2**20,
            'G': 2**30,
            'T': 2**40,
        }

        number, suffix = match.groups()
        if suffix in decimal_multiplier:
            mult = decimal_multiplier[suffix]
            return int(float(number) * mult)
        elif suffix in binary_multiplier:
            mult = binary_multiplier[suffix]
            # Do not allow decimals with binary multipliers: int() rejects
            # strings such as '1.5', which is turned into a CLI-level error.
            try:
                return int(number) * mult
            except ValueError as e:
                raise argparse.ArgumentTypeError(
                    "Decimals are not allowed "
                    f"with binary suffixes like {suffix}. Did you mean to use "
                    f"{number}{suffix.lower()} instead?") from e

    # Regular plain number.
    return int(value)
# These functions are used by sphinx to build the documentation # These functions are used by sphinx to build the documentation
def _engine_args_parser(): def _engine_args_parser():
return EngineArgs.add_cli_args(FlexibleArgumentParser()) return EngineArgs.add_cli_args(FlexibleArgumentParser())
......
...@@ -8,7 +8,7 @@ from contextlib import contextmanager ...@@ -8,7 +8,7 @@ from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
Iterable, List, Mapping, NamedTuple, Optional) Iterable, List, Literal, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload from typing import Set, Type, Union, cast, overload
...@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.logits_processors import ( ...@@ -30,7 +30,7 @@ from vllm.entrypoints.openai.logits_processors import (
get_logits_processors as get_openai_logits_processors) get_logits_processors as get_openai_logits_processors)
from vllm.executor.executor_base import ExecutorBase from vllm.executor.executor_base import ExecutorBase
from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
PromptType) PromptType, SingletonInputs)
from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs
from vllm.inputs.preprocess import InputPreprocessor from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -40,6 +40,7 @@ from vllm.model_executor.guided_decoding import ( ...@@ -40,6 +40,7 @@ from vllm.model_executor.guided_decoding import (
get_local_guided_decoding_logits_processor) get_local_guided_decoding_logits_processor)
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.processing import EncDecMultiModalProcessor
from vllm.outputs import (PoolingRequestOutput, RequestOutput, from vllm.outputs import (PoolingRequestOutput, RequestOutput,
RequestOutputFactory) RequestOutputFactory)
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
...@@ -2029,29 +2030,61 @@ class LLMEngine: ...@@ -2029,29 +2030,61 @@ class LLMEngine:
lora_request: Optional[LoRARequest]): lora_request: Optional[LoRARequest]):
encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)
# For encoder-decoder multimodal models, the max_prompt_len if encoder_inputs is not None:
# restricts the decoder prompt length self._validate_model_input(encoder_inputs,
if self.model_config.is_multimodal_model: lora_request,
prompt_inputs = decoder_inputs prompt_type="encoder")
else:
prompt_inputs = encoder_inputs or decoder_inputs
prompt_ids = prompt_inputs["prompt_token_ids"]
if prompt_ids is None or len(prompt_ids) == 0: self._validate_model_input(decoder_inputs,
raise ValueError("Prompt cannot be empty") lora_request,
prompt_type="decoder")
if self.model_config.is_multimodal_model: def _validate_model_input(
max_prompt_len = self.model_config.max_model_len self,
prompt_inputs: SingletonInputs,
lora_request: Optional[LoRARequest],
*,
prompt_type: Literal["encoder", "decoder"],
):
model_config = self.model_config
tokenizer = (None if self.tokenizer is None else
self.tokenizer.get_lora_tokenizer(lora_request))
if len(prompt_ids) > max_prompt_len: prompt_ids = prompt_inputs["prompt_token_ids"]
raise ValueError( if not prompt_ids:
f"The prompt (total length {len(prompt_ids)}) is too long " if prompt_type == "encoder" and model_config.is_multimodal_model:
f"to fit into the model (context length {max_prompt_len}). " pass # Mllama may have empty encoder inputs for text-only data
else:
raise ValueError(f"The {prompt_type} prompt cannot be empty")
max_prompt_len = self.model_config.max_model_len
if len(prompt_ids) >= max_prompt_len:
if prompt_type == "encoder" and model_config.is_multimodal_model:
mm_registry = self.input_preprocessor.mm_registry
mm_processor = mm_registry.create_processor(
model_config,
tokenizer=tokenizer or object(), # Dummy if no tokenizer
)
assert isinstance(mm_processor, EncDecMultiModalProcessor)
if mm_processor.pad_dummy_encoder_prompt:
return # Skip encoder length check for Whisper
if model_config.is_multimodal_model:
suggestion = (
"Make sure that `max_model_len` is no smaller than the " "Make sure that `max_model_len` is no smaller than the "
"number of text tokens plus multimodal tokens. For image " "number of text tokens plus multimodal tokens. For image "
"inputs, the number of image tokens depends on the number " "inputs, the number of image tokens depends on the number "
"of images, and possibly their aspect ratios as well.") "of images, and possibly their aspect ratios as well.")
else:
suggestion = (
"Make sure that `max_model_len` is no smaller than the "
"number of text tokens.")
raise ValueError(
f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
f"longer than the maximum model length of {max_prompt_len}. "
f"{suggestion}")
# TODO: Find out how many placeholder tokens are there so we can # TODO: Find out how many placeholder tokens are there so we can
# check that chunked prefill does not truncate them # check that chunked prefill does not truncate them
......
...@@ -156,7 +156,8 @@ class Metrics: ...@@ -156,7 +156,8 @@ class Metrics:
labelnames=labelnames, labelnames=labelnames,
buckets=[ buckets=[
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0
]) ])
self.histogram_time_per_output_token = self._histogram_cls( self.histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds", name="vllm:time_per_output_token_seconds",
...@@ -164,14 +165,14 @@ class Metrics: ...@@ -164,14 +165,14 @@ class Metrics:
labelnames=labelnames, labelnames=labelnames,
buckets=[ buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
]) ])
# Request stats # Request stats
# Latency # Latency
request_latency_buckets = [ request_latency_buckets = [
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
] ]
self.histogram_e2e_time_request = self._histogram_cls( self.histogram_e2e_time_request = self._histogram_cls(
name="vllm:e2e_request_latency_seconds", name="vllm:e2e_request_latency_seconds",
......
...@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): ...@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
externally (before the next schedule() call) externally (before the next schedule() call)
""" """
# Sequences can be in RUNNING or FINISHED_ABORTED state # Sequences can be in RUNNING or FINISHED_ABORTED state
# once scheduled, as a sequence is moved to FINSIHED_ABORTED # once scheduled, as a sequence is moved to FINISHED_ABORTED
# if a client disconnects from the api server. # if a client disconnects from the api server.
seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
if seqs is None: if seqs is None:
......
...@@ -35,7 +35,7 @@ from typing_extensions import Required, TypeAlias, TypedDict ...@@ -35,7 +35,7 @@ from typing_extensions import Required, TypeAlias, TypedDict
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal import MultiModalDataDict from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.utils import MediaConnector from vllm.multimodal.utils import MediaConnector
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
...@@ -452,8 +452,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -452,8 +452,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
self._model_config = model_config self._model_config = model_config
self._tokenizer = tokenizer self._tokenizer = tokenizer
self._allowed_items = (model_config.multimodal_config.limit_per_prompt
if model_config.multimodal_config else {})
self._items_by_modality = defaultdict[str, list[_T]](list) self._items_by_modality = defaultdict[str, list[_T]](list)
...@@ -465,6 +463,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -465,6 +463,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
def allowed_local_media_path(self): def allowed_local_media_path(self):
return self._model_config.allowed_local_media_path return self._model_config.allowed_local_media_path
@property
def mm_registry(self):
    """Return the module-level ``MULTIMODAL_REGISTRY`` object."""
    return MULTIMODAL_REGISTRY
@staticmethod @staticmethod
@cache @cache
def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str:
...@@ -487,8 +489,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -487,8 +489,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|endoftext10|>" # 200010 (see vocab.json in hf model) return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"): if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)" return "(<image>./</image>)"
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral", if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
"mistral3"): "pixtral", "mistral3"):
# These models do not use image tokens in the prompt # These models do not use image tokens in the prompt
return None return None
if model_type == "qwen": if model_type == "qwen":
...@@ -498,7 +500,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -498,7 +500,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
hf_config.image_token_index) hf_config.image_token_index)
if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2", if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
"internvl_chat", "skywork_chat", "NVLM_D", "internvl_chat", "skywork_chat", "NVLM_D",
"h2ovl_chat"): "h2ovl_chat", "idefics3", "smolvlm"):
return "<image>" return "<image>"
if model_type in ("mllama", "llama4"): if model_type in ("mllama", "llama4"):
return "<|image|>" return "<|image|>"
...@@ -506,8 +508,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -506,8 +508,6 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|vision_start|><|image_pad|><|vision_end|>" return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "molmo": if model_type == "molmo":
return "" return ""
if model_type == "idefics3":
return "<image>"
if model_type == "aria": if model_type == "aria":
return "<|fim_prefix|><|img|><|fim_suffix|>" return "<|fim_prefix|><|img|><|fim_suffix|>"
if model_type == "gemma3": if model_type == "gemma3":
...@@ -542,12 +542,29 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -542,12 +542,29 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
Add a multi-modal item to the current prompt and returns the Add a multi-modal item to the current prompt and returns the
placeholder string to use, if any. placeholder string to use, if any.
""" """
allowed_count = self._allowed_items.get(modality, 1) mm_registry = self.mm_registry
model_config = self.model_config
input_modality = modality.replace("_embeds", "")
if mm_registry.has_processor(model_config):
mm_processor = mm_registry.create_processor(model_config)
allowed_counts = mm_processor.info.get_allowed_mm_limits()
allowed_count = allowed_counts.get(input_modality, 0)
else:
mm_config = model_config.multimodal_config
if mm_config is None:
msg = "This model does not support multi-modal inputs"
raise ValueError(msg)
allowed_count = mm_config.get_limit_per_prompt(input_modality)
current_count = len(self._items_by_modality[modality]) + 1 current_count = len(self._items_by_modality[modality]) + 1
if current_count > allowed_count: if current_count > allowed_count:
raise ValueError( raise ValueError(
f"At most {allowed_count} {modality}(s) may be provided in " f"At most {allowed_count} {modality}(s) may be provided in "
"one request.") "one request. You can set `--limit-mm-per-prompt` to "
"increase this limit if the model supports it.")
self._items_by_modality[modality].append(item) self._items_by_modality[modality].append(item)
...@@ -874,19 +891,19 @@ MM_PARSER_MAP: dict[ ...@@ -874,19 +891,19 @@ MM_PARSER_MAP: dict[
Callable[[ChatCompletionContentPartParam], _ContentPart], Callable[[ChatCompletionContentPartParam], _ContentPart],
] = { ] = {
"text": "text":
lambda part: _TextParser(part).get("text", ""), lambda part: _TextParser(part).get("text", None),
"image_url": "image_url":
lambda part: _ImageParser(part).get("image_url", {}).get("url", ""), lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
"image_embeds": "image_embeds":
lambda part: _ImageEmbedsParser(part).get("image_embeds", {}), lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
"audio_url": "audio_url":
lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
"input_audio": "input_audio":
lambda part: _InputAudioParser(part).get("input_audio", {}), lambda part: _InputAudioParser(part).get("input_audio", None),
"refusal": "refusal":
lambda part: _RefusalParser(part).get("refusal", ""), lambda part: _RefusalParser(part).get("refusal", None),
"video_url": "video_url":
lambda part: _VideoParser(part).get("video_url", {}).get("url", ""), lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
} }
...@@ -1005,11 +1022,11 @@ def _parse_chat_message_content_part( ...@@ -1005,11 +1022,11 @@ def _parse_chat_message_content_part(
part_type, content = _parse_chat_message_content_mm_part(part) part_type, content = _parse_chat_message_content_mm_part(part)
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
# content is empty, log a warning and skip # content is None, log a warning and skip
if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
logger.warning( logger.warning(
"Skipping multimodal part (type: '%s') " "Skipping multimodal part '%s' (type: '%s') "
"with empty / unparsable content.", part_type) "with empty / unparsable content.", part, part_type)
return None return None
if part_type in ("text", "refusal"): if part_type in ("text", "refusal"):
...@@ -1195,8 +1212,15 @@ def apply_mistral_chat_template( ...@@ -1195,8 +1212,15 @@ def apply_mistral_chat_template(
**kwargs, **kwargs,
) )
return tokenizer.apply_chat_template( try:
messages=messages, return tokenizer.apply_chat_template(
tools=tools, messages=messages,
**kwargs, tools=tools,
) **kwargs,
)
# mistral-common uses assert statements to stop processing of input
# if input does not comply with the expected format.
# We convert those assertion errors to ValueErrors so they can be
# properly caught in the preprocessing_input step
except AssertionError as e:
raise ValueError from e
...@@ -32,6 +32,7 @@ class BenchmarkSubcommandBase(CLISubcommand): ...@@ -32,6 +32,7 @@ class BenchmarkSubcommandBase(CLISubcommand):
parser = subparsers.add_parser( parser = subparsers.add_parser(
self.name, self.name,
help=self.help, help=self.help,
description=self.help,
usage=f"vllm bench {self.name} [options]") usage=f"vllm bench {self.name} [options]")
self.add_cli_args(parser) self.add_cli_args(parser)
return parser return parser
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment