Commit a3f8d5dd authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
......@@ -6,6 +6,7 @@ from collections.abc import Sequence
from enum import Enum, auto
from random import choices
from string import ascii_letters, digits
from typing import Any
import ijson
import regex as re
......@@ -20,11 +21,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import MistralTokenizer, TokenizerLike
logger = init_logger(__name__)
......@@ -84,6 +86,7 @@ class MistralToolParser(ToolParser):
# initialize properties used for state when parsing tool calls in
# streaming mode
self.prev_tool_call_arr: list[dict[str, Any]] = []
self.current_tool_id: int = -1
self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START
......
......@@ -18,10 +18,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
logger = init_logger(__name__)
......
......@@ -12,10 +12,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
if TYPE_CHECKING:
from vllm.tokenizers import TokenizerLike
......@@ -43,6 +43,7 @@ class OpenAIToolParser(ToolParser):
parser = parse_output_into_messages(token_ids)
tool_calls = []
final_content = None
commentary_content = None
if len(parser.messages) > 0:
for msg in parser.messages:
......@@ -75,11 +76,15 @@ class OpenAIToolParser(ToolParser):
)
elif msg.channel == "final":
final_content = msg_text
elif msg.channel == "commentary" and not msg.recipient:
commentary_content = msg_text
return ExtractedToolCallInformation(
tools_called=len(tool_calls) > 0,
tool_calls=tool_calls,
content=final_content,
# prefer final content over commentary content if both are present
# commentary content is tool call preambles meant to be shown to the user
content=final_content or commentary_content,
)
def extract_tool_calls_streaming(
......
......@@ -16,10 +16,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
logger = init_logger(__name__)
......
......@@ -19,10 +19,10 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.logger import init_logger
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
logger = init_logger(__name__)
......
......@@ -18,11 +18,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
logger = init_logger(__name__)
......
......@@ -19,11 +19,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
logger = init_logger(__name__)
......
......@@ -21,11 +21,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
logger = init_logger(__name__)
......
......@@ -17,11 +17,11 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.utils import random_uuid
logger = init_logger(__name__)
......
......@@ -17,7 +17,7 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
from vllm.tool_parsers.abstract_tool_parser import (
ToolParser,
)
from vllm.logger import init_logger
......
......@@ -66,6 +66,7 @@ class LazyConfigDict(dict):
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
afmoe="AfmoeConfig",
bagel="BagelConfig",
chatglm="ChatGLMConfig",
deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v32="DeepseekV3Config",
......@@ -306,8 +307,13 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
from vllm.config.utils import getattr_iter
rope_theta_names = ("rope_theta", "rotary_emb_base")
rope_theta = getattr_iter(config, rope_theta_names, None)
# Older custom models may use non-standard field names
# which need patching for both Transformers v4 and v5.
names = ["rope_theta", "rotary_emb_base"]
rope_theta = getattr_iter(config, names, None, warn=True)
names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
partial_rotary_factor = getattr_iter(config, names, None, warn=True)
if Version(version("transformers")) < Version("5.0.0.dev0"):
# Transformers v4 installed, legacy config fields may be present
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
......@@ -316,14 +322,18 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
if not hasattr(config, "rope_parameters"):
config.rope_parameters = {"rope_type": "default"}
config.rope_parameters["rope_theta"] = rope_theta
partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct")
partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None)
if partial_rotary_factor is not None:
if not hasattr(config, "rope_parameters"):
config.rope_parameters = {"rope_type": "default"}
config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
elif rope_theta is not None or hasattr(config, "rope_parameters"):
# Transformers v5 installed
# Patch these fields in case they used non-standard names
if rope_theta is not None:
config.rope_theta = rope_theta
if partial_rotary_factor is not None:
config.partial_rotary_factor = partial_rotary_factor
# Standardize and validate RoPE parameters
config.standardize_rope_params()
config.validate_rope()
......@@ -608,6 +618,28 @@ def get_config(
hf_overrides=hf_overrides_kw,
**kwargs,
)
# Patching defaults for GGUF models
if _is_gguf:
# Some models have different default values between GGUF and HF.
def apply_gguf_default(key: str, gguf_default: Any):
    """
    Fall back to the GGUF default for `key` when it was not set explicitly.

    Operates on the enclosing scope's `config` and `config_dict`:
    a `key` that is present in `config_dict` is treated as explicitly
    configured and left alone; otherwise `config` is updated so that
    `key` maps to `gguf_default` instead of the HF default.
    """
    if key in config_dict:
        # Explicitly configured: keep whatever value is already there.
        return
    config.update({key: gguf_default})
# Apply architecture-specific GGUF defaults.
if config.model_type in {"qwen3_moe"}:
# Qwen3 MoE: norm_topk_prob is always true.
# Note that, this parameter is always false (HF default) on Qwen2 MoE.
apply_gguf_default("norm_topk_prob", True)
# Special architecture mapping check for GGUF models
if _is_gguf:
if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
......
......@@ -16,6 +16,7 @@ import importlib
_CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"BagelConfig": "vllm.transformers_utils.configs.bagel",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
......@@ -54,6 +55,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
__all__ = [
"AfmoeConfig",
"BagelConfig",
"ChatGLMConfig",
"DeepseekVLV2Config",
"DeepseekV3Config",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.models.qwen2 import Qwen2Config
class BagelConfig(PretrainedConfig):
    """Configuration container for the BAGEL model.

    Holds two nested configs -- a ``Qwen2Config`` for the language model and
    a ``SiglipVisionConfig`` for the vision tower -- plus a VAE settings dict
    and a set of scalar options. Nested configs may be supplied either as
    ready-made config objects or as plain dicts of constructor kwargs.
    """

    model_type = "bagel"

    def __init__(
        self,
        visual_gen: bool = True,
        visual_und: bool = True,
        llm_config: dict | Qwen2Config | None = None,
        vit_config: dict | SiglipVisionConfig | None = None,
        vae_config: dict | None = None,
        latent_patch_size: int = 2,
        max_latent_size: int = 32,
        vit_max_num_patch_per_side: int = 70,
        connector_act: str = "gelu_pytorch_tanh",
        interpolate_pos: bool = False,
        timestep_shift: float = 1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Nested configs: a dict is expanded into the matching config class,
        # anything falsy (e.g. None) is replaced by that class's defaults,
        # and an existing config object is stored as-is.
        self.llm_config = (
            Qwen2Config(**llm_config)
            if isinstance(llm_config, dict)
            else (llm_config or Qwen2Config())
        )
        self.vit_config = (
            SiglipVisionConfig(**vit_config)
            if isinstance(vit_config, dict)
            else (vit_config or SiglipVisionConfig())
        )

        # VAE settings stay a plain dict; fall back to the built-in defaults
        # when nothing (or an empty dict) was provided.
        if not vae_config:
            vae_config = {"z_channels": 16, "downsample": 8}
        self.vae_config = vae_config

        # Remaining options are stored verbatim.
        self.visual_gen = visual_gen
        self.visual_und = visual_und
        self.latent_patch_size = latent_patch_size
        self.max_latent_size = max_latent_size
        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
        self.connector_act = connector_act
        self.interpolate_pos = interpolate_pos
        self.timestep_shift = timestep_shift

    @property
    def hidden_size(self) -> int:
        """Hidden size, taken from the nested language-model config."""
        language_config = self.llm_config
        return language_config.hidden_size
......@@ -8,6 +8,7 @@ reasons:
- There is a need to override the existing processor to support vLLM.
"""
from vllm.transformers_utils.processors.bagel import BagelProcessor
from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor
......@@ -15,6 +16,7 @@ from vllm.transformers_utils.processors.ovis import OvisProcessor
from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor
__all__ = [
"BagelProcessor",
"DeepseekVLV2Processor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
"""BAGEL processor for image and text inputs."""
from transformers import AutoProcessor
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
class BagelProcessor(ProcessorMixin):
    """
    BAGEL processor combining an image processor and a tokenizer.

    The component classes are declared by name below: a SigLIP image
    processor and an auto-resolved tokenizer.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "SiglipImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __call__(
        self,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        images: ImageInput = None,
        **kwargs,
    ):
        """
        Prepare text and/or images for the model.

        `images` (if given) go through the image processor and `text`
        (if given) through the tokenizer; both receive `**kwargs`. When both
        produce a result, the image processor's "pixel_values" entry is
        added to the tokenizer output and that output is returned. When only
        one produces a result, it is returned on its own; with neither,
        the return value is None.
        """
        pixel_values = None
        if images is not None:
            # Forward a copy of the caller's kwargs, defaulting
            # return_tensors to "pt" (PyTorch tensors) when not specified.
            image_kwargs = dict(kwargs)
            image_kwargs.setdefault("return_tensors", "pt")
            pixel_values = self.image_processor(images, **image_kwargs)

        text_inputs = None if text is None else self.tokenizer(text, **kwargs)

        if pixel_values is None:
            return text_inputs
        if text_inputs is None:
            return pixel_values

        # Both modalities present: merge the pixels into the text encoding.
        text_inputs["pixel_values"] = pixel_values["pixel_values"]
        return text_inputs

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's batch_decode.
        """
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the wrapped tokenizer's decode.
        """
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names,
        with duplicates removed and first-seen order preserved."""
        combined = (
            self.tokenizer.model_input_names
            + self.image_processor.model_input_names
        )
        # dict.fromkeys keeps the first occurrence of each name, in order.
        return list(dict.fromkeys(combined))
# Import-time side effect: register BagelProcessor with AutoProcessor under
# the key "BagelProcessor".
AutoProcessor.register("BagelProcessor", BagelProcessor)
......@@ -17,7 +17,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
"`vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
......@@ -29,7 +29,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
"has been moved to `vllm.tokenizers.get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
......@@ -41,7 +41,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
......@@ -53,29 +53,29 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
return cached_tokenizer_from_config
if name == "init_tokenizer_from_configs":
from vllm.tokenizers import init_tokenizer_from_config
from vllm.tokenizers import cached_tokenizer_from_config
warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
return init_tokenizer_from_config
return cached_tokenizer_from_config
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
def decode_tokens(
tokenizer: TokenizerLike,
token_ids: list[int],
......@@ -97,7 +97,7 @@ def decode_tokens(
return tokenizer.decode(token_ids, **kw_args)
@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
def encode_tokens(
tokenizer: TokenizerLike,
text: str,
......
......@@ -11,7 +11,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
"moved to `vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
......@@ -23,7 +23,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
"moved to `vllm.tokenizers.TokenizerRegistry`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
......
......@@ -38,7 +38,7 @@ class DeepGemmQuantScaleFMT(Enum):
return DeepGemmQuantScaleFMT.FLOAT32
return (
DeepGemmQuantScaleFMT.UE8M0
if current_platform.is_device_capability(100)
if current_platform.is_device_capability_family(100)
else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
)
......@@ -50,7 +50,7 @@ def is_deep_gemm_supported() -> bool:
"""
is_supported_arch = current_platform.is_cuda() and (
current_platform.is_device_capability(90)
or current_platform.is_device_capability(100)
or current_platform.is_device_capability_family(100)
)
return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
......@@ -481,22 +481,6 @@ def should_use_deepgemm_for_fp8_linear(
)
def should_use_deepgemm_for_fp8_linear_for_nk(
output_dtype: torch.dtype,
shape0: int,
shape1: int,
supports_deep_gemm: bool | None = None,
):
if supports_deep_gemm is None:
supports_deep_gemm = is_deep_gemm_supported()
return (
supports_deep_gemm
and output_dtype == torch.bfloat16
and shape0 % 128 == 0
and shape1 % 128 == 0
)
__all__ = [
"calc_diff",
"DeepGemmQuantScaleFMT",
......@@ -511,7 +495,6 @@ __all__ = [
"is_deep_gemm_supported",
"get_num_sms",
"should_use_deepgemm_for_fp8_linear",
"should_use_deepgemm_for_fp8_linear_for_nk",
"get_col_major_tma_aligned_tensor",
"get_mk_alignment_for_contiguous_layout",
]
......@@ -264,11 +264,15 @@ def supports_trtllm_attention() -> bool:
return False
# Requires SM100 and NVIDIA artifactory to be accessible to download cubins
return current_platform.is_device_capability(100) and has_nvidia_artifactory()
return (
current_platform.is_device_capability_family(100) and has_nvidia_artifactory()
)
def force_use_trtllm_attention() -> bool | None:
"""
This function should only be called during initialization stage when vllm config
is set.
Return `None` if --attention-config.use_trtllm_attention is not set,
return `True` if TRTLLM attention is forced to be used,
return `False` if TRTLLM attention is forced to be not used.
......@@ -296,11 +300,12 @@ def use_trtllm_attention(
kv_cache_dtype: str,
q_dtype: torch.dtype,
is_prefill: bool,
# None means auto-detection, True means force on, False means force off
force_use_trtllm: bool | None = None,
has_sinks: bool = False,
has_spec: bool = False,
) -> bool:
"""Return `True` if TRTLLM attention is used."""
force_use_trtllm = force_use_trtllm_attention()
# CLI argument is set to 0 - respect it
if force_use_trtllm is not None and not force_use_trtllm:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment