Commit fc67613a authored by zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
......@@ -1577,6 +1577,22 @@ class VllmConfig:
compile_range_end,
)
if compilation_config.pass_config.fuse_minimax_qk_norm:
from vllm.compilation.passes.fusion.minimax_qk_norm_fusion import (
MAX_TOKEN_NUM,
)
max_token_num = min(
MAX_TOKEN_NUM, self.scheduler_config.max_num_batched_tokens
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_endpoints.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below MiniMax QK norm fusion threshold, "
"MiniMax QK norm fusion enabled for all num_tokens."
)
if compilation_config.compile_ranges_endpoints is not None:
for x in compilation_config.compile_ranges_endpoints:
assert isinstance(x, int)
......
......@@ -170,6 +170,7 @@ class AnthropicServingMessages(OpenAIServingChat):
else:
cls._convert_message_content(msg, openai_msg, openai_messages)
if not (msg.role == "user" and "content" not in openai_msg):
openai_messages.append(openai_msg)
@classmethod
......
......@@ -372,6 +372,7 @@ async def init_app_state(
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)
......@@ -467,6 +468,7 @@ async def init_render_app_state(
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)
......
......@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None,
tool_dicts=tool_dicts,
tool_parser=self.parser.tool_parser_cls if self.parser else None,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
)
return messages, engine_inputs
......@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None,
tool_dicts=tool_dicts,
tool_parser=tool_parser,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
)
return engine_inputs
......
......@@ -44,6 +44,7 @@ from vllm.inputs import (
)
from vllm.logger import init_logger
from vllm.parser import ParserManager
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs.preprocess import (
extract_prompt_components,
......@@ -74,6 +75,7 @@ class OpenAIServingRender:
enable_auto_tools: bool = False,
exclude_tools_when_tool_choice_none: bool = False,
tool_parser: str | None = None,
reasoning_parser: str | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None,
log_error_stack: bool = False,
) -> None:
......@@ -94,6 +96,11 @@ class OpenAIServingRender:
enable_auto_tools=enable_auto_tools,
model_name=model_config.model,
)
self.reasoning_parser: type[ReasoningParser] | None = (
ParserManager.get_reasoning_parser(
reasoning_parser_name=reasoning_parser,
)
)
self.default_chat_template_kwargs: dict[str, Any] = (
default_chat_template_kwargs or {}
)
......@@ -245,6 +252,7 @@ class OpenAIServingRender:
default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts,
tool_parser=tool_parser,
reasoning_parser=self.reasoning_parser,
)
else:
# For GPT-OSS.
......@@ -498,6 +506,9 @@ class OpenAIServingRender:
default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: type[ToolParser] | None = None,
reasoning_parser: type[ReasoningParser] | None = None,
*,
skip_mm_cache: bool = False,
) -> tuple[list[ConversationMessage], list[EngineInput]]:
"""Copied from OpenAIServing._preprocess_chat."""
renderer = self.renderer
......@@ -531,6 +542,10 @@ class OpenAIServingRender:
},
)
if reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request = reasoning_parser(tokenizer).adjust_request(request=request)
# tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser
# is set, we want to prevent parsing a tool_call hallucinated by the LLM
......
This diff is collapsed.
......@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found
"""
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it.
if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")):
......
This diff is collapsed.
......@@ -65,7 +65,12 @@ from vllm.multimodal.processing.processor import (
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .interfaces import (
MultiModalEmbeddings,
SupportsEagle3,
SupportsMultiModal,
SupportsPP,
)
from .utils import (
AutoWeightsLoader,
WeightsMapper,
......@@ -121,8 +126,12 @@ class Gemma4AudioInputs(TensorSchema):
"""
type: Literal["audio"] = "audio"
input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")]
input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")]
input_features_padded: Annotated[
torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
]
input_features_mask: Annotated[
torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
]
Gemma4ImageInputs = Gemma4ImagePixelInputs
......@@ -163,9 +172,14 @@ class Gemma4ProcessingInfo(BaseProcessingInfo):
Setting ``add_special_tokens=False`` here prevents the duplicate and
ensures both ``llm.generate()`` and the chat/completions API behave
correctly.
correctly for IT models. For PT models (without chat template), we
keep the default (True) to ensure BOS is added for raw prompts.
"""
tokenizer = self.ctx.get_tokenizer()
has_chat_template = getattr(tokenizer, "chat_template", None) is not None
params = super().get_default_tok_params()
if has_chat_template:
params = params.with_kwargs(add_special_tokens=False)
return params
......@@ -503,6 +517,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video: list[list[float]] = []
video_frame_counts: list[int] = []
video_replacements: list[str] = []
for item in videos:
video_array, metadata = item
......@@ -555,10 +571,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video.append(timestamps)
video_frame_counts.append(len(frames))
# Build expanded replacement text and replace the
# <|video|> placeholder in the prompt.
# Use split(token, 1) to avoid collision — the
# replacement text itself contains <|video|> tokens.
# Build expanded replacement text for this video.
ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
replacement = " ".join(
f"{t} {processor.boi_token}"
......@@ -566,9 +579,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
f"{processor.eoi_token}"
for t, n in zip(ts_strs, num_soft_per_frame)
)
parts = prompt.split(processor.video_token, 1)
if len(parts) == 2:
prompt = parts[0] + replacement + parts[1]
video_replacements.append(replacement)
# Replace all <|video|> placeholders at once. We split on
# video_token to get N+1 parts, then interleave with the
# N replacement strings. This avoids the iterative
# split-replace bug where replacement text (which itself
# contains <|video|> tokens) collides with later splits.
vt = processor.video_token
parts = prompt.split(vt, len(video_replacements))
# NOTE: len(parts) <= len(video_replacements) + 1
parts_with_repl: list[str] = []
for part, repl in zip(parts, video_replacements):
parts_with_repl.extend([part, repl])
parts_with_repl.extend(parts[len(video_replacements) :])
prompt = "".join(parts_with_repl)
video_outputs = {
"pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
......@@ -631,19 +658,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
)
if "input_features" in processed_outputs:
# Keep padded features for batched audio tower execution.
processed_outputs["input_features_padded"] = processed_outputs[
"input_features"
]
# Unpad per-item so each item's cache entry is self-contained.
# Unpad per-item so each item's cache entry is
# self-contained. The batched() field config in
# _get_mm_fields_config will re-pad all fields to the
# batch's max length at batch time, ensuring consistent
# padding regardless of cache history.
masks = processed_outputs["input_features_mask"]
unpadded_features = [
f[mask]
for f, mask in zip(
processed_outputs["input_features"],
processed_outputs["input_features_mask"],
masks,
)
]
unpadded_masks = [mask[mask] for mask in masks]
processed_outputs["input_features"] = unpadded_features
processed_outputs["input_features_padded"] = unpadded_features
processed_outputs["input_features_mask"] = unpadded_masks
# Merge video outputs into the final result
combined_outputs = dict(processed_outputs, **video_outputs)
......@@ -848,7 +879,12 @@ class Gemma4MultimodalEmbedder(nn.Module):
info=Gemma4ProcessingInfo,
dummy_inputs=Gemma4DummyInputsBuilder,
)
class Gemma4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
class Gemma4ForConditionalGeneration(
nn.Module,
SupportsMultiModal,
SupportsPP,
SupportsEagle3,
):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment