Commit fc67613a authored by zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
...@@ -1577,6 +1577,22 @@ class VllmConfig: ...@@ -1577,6 +1577,22 @@ class VllmConfig:
compile_range_end, compile_range_end,
) )
if compilation_config.pass_config.fuse_minimax_qk_norm:
from vllm.compilation.passes.fusion.minimax_qk_norm_fusion import (
MAX_TOKEN_NUM,
)
max_token_num = min(
MAX_TOKEN_NUM, self.scheduler_config.max_num_batched_tokens
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_endpoints.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below MiniMax QK norm fusion threshold, "
"MiniMax QK norm fusion enabled for all num_tokens."
)
if compilation_config.compile_ranges_endpoints is not None: if compilation_config.compile_ranges_endpoints is not None:
for x in compilation_config.compile_ranges_endpoints: for x in compilation_config.compile_ranges_endpoints:
assert isinstance(x, int) assert isinstance(x, int)
......
...@@ -170,6 +170,7 @@ class AnthropicServingMessages(OpenAIServingChat): ...@@ -170,6 +170,7 @@ class AnthropicServingMessages(OpenAIServingChat):
else: else:
cls._convert_message_content(msg, openai_msg, openai_messages) cls._convert_message_content(msg, openai_msg, openai_messages)
if not (msg.role == "user" and "content" not in openai_msg):
openai_messages.append(openai_msg) openai_messages.append(openai_msg)
@classmethod @classmethod
......
...@@ -372,6 +372,7 @@ async def init_app_state( ...@@ -372,6 +372,7 @@ async def init_app_state(
enable_auto_tools=args.enable_auto_tool_choice, enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser, tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs, default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack, log_error_stack=args.log_error_stack,
) )
...@@ -467,6 +468,7 @@ async def init_render_app_state( ...@@ -467,6 +468,7 @@ async def init_render_app_state(
enable_auto_tools=args.enable_auto_tool_choice, enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser, tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs, default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack, log_error_stack=args.log_error_stack,
) )
......
...@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None, default_template_kwargs=None,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=self.parser.tool_parser_cls if self.parser else None, tool_parser=self.parser.tool_parser_cls if self.parser else None,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
) )
return messages, engine_inputs return messages, engine_inputs
...@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None, default_template_kwargs=None,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=tool_parser, tool_parser=tool_parser,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
) )
return engine_inputs return engine_inputs
......
...@@ -44,6 +44,7 @@ from vllm.inputs import ( ...@@ -44,6 +44,7 @@ from vllm.inputs import (
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.parser import ParserManager from vllm.parser import ParserManager
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.renderers import BaseRenderer, merge_kwargs from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs.preprocess import ( from vllm.renderers.inputs.preprocess import (
extract_prompt_components, extract_prompt_components,
...@@ -74,6 +75,7 @@ class OpenAIServingRender: ...@@ -74,6 +75,7 @@ class OpenAIServingRender:
enable_auto_tools: bool = False, enable_auto_tools: bool = False,
exclude_tools_when_tool_choice_none: bool = False, exclude_tools_when_tool_choice_none: bool = False,
tool_parser: str | None = None, tool_parser: str | None = None,
reasoning_parser: str | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None, default_chat_template_kwargs: dict[str, Any] | None = None,
log_error_stack: bool = False, log_error_stack: bool = False,
) -> None: ) -> None:
...@@ -94,6 +96,11 @@ class OpenAIServingRender: ...@@ -94,6 +96,11 @@ class OpenAIServingRender:
enable_auto_tools=enable_auto_tools, enable_auto_tools=enable_auto_tools,
model_name=model_config.model, model_name=model_config.model,
) )
self.reasoning_parser: type[ReasoningParser] | None = (
ParserManager.get_reasoning_parser(
reasoning_parser_name=reasoning_parser,
)
)
self.default_chat_template_kwargs: dict[str, Any] = ( self.default_chat_template_kwargs: dict[str, Any] = (
default_chat_template_kwargs or {} default_chat_template_kwargs or {}
) )
...@@ -245,6 +252,7 @@ class OpenAIServingRender: ...@@ -245,6 +252,7 @@ class OpenAIServingRender:
default_template_kwargs=self.default_chat_template_kwargs, default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=tool_parser, tool_parser=tool_parser,
reasoning_parser=self.reasoning_parser,
) )
else: else:
# For GPT-OSS. # For GPT-OSS.
...@@ -498,6 +506,9 @@ class OpenAIServingRender: ...@@ -498,6 +506,9 @@ class OpenAIServingRender:
default_template_kwargs: dict[str, Any] | None, default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None, tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: type[ToolParser] | None = None, tool_parser: type[ToolParser] | None = None,
reasoning_parser: type[ReasoningParser] | None = None,
*,
skip_mm_cache: bool = False,
) -> tuple[list[ConversationMessage], list[EngineInput]]: ) -> tuple[list[ConversationMessage], list[EngineInput]]:
"""Copied from OpenAIServing._preprocess_chat.""" """Copied from OpenAIServing._preprocess_chat."""
renderer = self.renderer renderer = self.renderer
...@@ -531,6 +542,10 @@ class OpenAIServingRender: ...@@ -531,6 +542,10 @@ class OpenAIServingRender:
}, },
) )
if reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request = reasoning_parser(tokenizer).adjust_request(request=request)
# tool parsing is done only if a tool_parser has been set and if # tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
# is set, we want to prevent parsing a tool_call hallucinated by the LLM # is set, we want to prevent parsing a tool_call hallucinated by the LLM
......
This diff is collapsed.
...@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader): ...@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight') GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found or None if no mapping found
""" """
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py # Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix. # tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as # Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it. # gguf-py expects it.
if hf_name.startswith("language_model."): if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.' hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix # Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")): if hf_name.endswith((".weight", ".bias")):
......
This diff is collapsed.
...@@ -65,7 +65,12 @@ from vllm.multimodal.processing.processor import ( ...@@ -65,7 +65,12 @@ from vllm.multimodal.processing.processor import (
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import (
MultiModalEmbeddings,
SupportsEagle3,
SupportsMultiModal,
SupportsPP,
)
from .utils import ( from .utils import (
AutoWeightsLoader, AutoWeightsLoader,
WeightsMapper, WeightsMapper,
...@@ -121,8 +126,12 @@ class Gemma4AudioInputs(TensorSchema): ...@@ -121,8 +126,12 @@ class Gemma4AudioInputs(TensorSchema):
""" """
type: Literal["audio"] = "audio" type: Literal["audio"] = "audio"
input_features_padded: Annotated[torch.Tensor, TensorShape("bn", "s", "f")] input_features_padded: Annotated[
input_features_mask: Annotated[torch.Tensor, TensorShape("bn", "s")] torch.Tensor, TensorShape("bn", "s", "f", dynamic_dims={"s"})
]
input_features_mask: Annotated[
torch.Tensor, TensorShape("bn", "s", dynamic_dims={"s"})
]
Gemma4ImageInputs = Gemma4ImagePixelInputs Gemma4ImageInputs = Gemma4ImagePixelInputs
...@@ -163,9 +172,14 @@ class Gemma4ProcessingInfo(BaseProcessingInfo): ...@@ -163,9 +172,14 @@ class Gemma4ProcessingInfo(BaseProcessingInfo):
Setting ``add_special_tokens=False`` here prevents the duplicate and Setting ``add_special_tokens=False`` here prevents the duplicate and
ensures both ``llm.generate()`` and the chat/completions API behave ensures both ``llm.generate()`` and the chat/completions API behave
correctly. correctly for IT models. For PT models (without chat template), we
keep the default (True) to ensure BOS is added for raw prompts.
""" """
tokenizer = self.ctx.get_tokenizer()
has_chat_template = getattr(tokenizer, "chat_template", None) is not None
params = super().get_default_tok_params() params = super().get_default_tok_params()
if has_chat_template:
params = params.with_kwargs(add_special_tokens=False) params = params.with_kwargs(add_special_tokens=False)
return params return params
...@@ -503,6 +517,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]): ...@@ -503,6 +517,8 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video: list[list[float]] = [] video_timestamps_per_video: list[list[float]] = []
video_frame_counts: list[int] = [] video_frame_counts: list[int] = []
video_replacements: list[str] = []
for item in videos: for item in videos:
video_array, metadata = item video_array, metadata = item
...@@ -555,10 +571,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]): ...@@ -555,10 +571,7 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
video_timestamps_per_video.append(timestamps) video_timestamps_per_video.append(timestamps)
video_frame_counts.append(len(frames)) video_frame_counts.append(len(frames))
# Build expanded replacement text and replace the # Build expanded replacement text for this video.
# <|video|> placeholder in the prompt.
# Use split(token, 1) to avoid collision — the
# replacement text itself contains <|video|> tokens.
ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps] ts_strs = [f"{int(s // 60):02d}:{int(s % 60):02d}" for s in timestamps]
replacement = " ".join( replacement = " ".join(
f"{t} {processor.boi_token}" f"{t} {processor.boi_token}"
...@@ -566,9 +579,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]): ...@@ -566,9 +579,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
f"{processor.eoi_token}" f"{processor.eoi_token}"
for t, n in zip(ts_strs, num_soft_per_frame) for t, n in zip(ts_strs, num_soft_per_frame)
) )
parts = prompt.split(processor.video_token, 1) video_replacements.append(replacement)
if len(parts) == 2:
prompt = parts[0] + replacement + parts[1] # Replace all <|video|> placeholders at once. We split on
# video_token to get N+1 parts, then interleave with the
# N replacement strings. This avoids the iterative
# split-replace bug where replacement text (which itself
# contains <|video|> tokens) collides with later splits.
vt = processor.video_token
parts = prompt.split(vt, len(video_replacements))
# NOTE: len(parts) <= len(video_replacements) + 1
parts_with_repl: list[str] = []
for part, repl in zip(parts, video_replacements):
parts_with_repl.extend([part, repl])
parts_with_repl.extend(parts[len(video_replacements) :])
prompt = "".join(parts_with_repl)
video_outputs = { video_outputs = {
"pixel_values_videos": torch.cat(all_video_pixel_values, dim=0), "pixel_values_videos": torch.cat(all_video_pixel_values, dim=0),
...@@ -631,19 +658,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]): ...@@ -631,19 +658,23 @@ class Gemma4MultiModalProcessor(BaseMultiModalProcessor[Gemma4ProcessingInfo]):
) )
if "input_features" in processed_outputs: if "input_features" in processed_outputs:
# Keep padded features for batched audio tower execution. # Unpad per-item so each item's cache entry is
processed_outputs["input_features_padded"] = processed_outputs[ # self-contained. The batched() field config in
"input_features" # _get_mm_fields_config will re-pad all fields to the
] # batch's max length at batch time, ensuring consistent
# Unpad per-item so each item's cache entry is self-contained. # padding regardless of cache history.
masks = processed_outputs["input_features_mask"]
unpadded_features = [ unpadded_features = [
f[mask] f[mask]
for f, mask in zip( for f, mask in zip(
processed_outputs["input_features"], processed_outputs["input_features"],
processed_outputs["input_features_mask"], masks,
) )
] ]
unpadded_masks = [mask[mask] for mask in masks]
processed_outputs["input_features"] = unpadded_features processed_outputs["input_features"] = unpadded_features
processed_outputs["input_features_padded"] = unpadded_features
processed_outputs["input_features_mask"] = unpadded_masks
# Merge video outputs into the final result # Merge video outputs into the final result
combined_outputs = dict(processed_outputs, **video_outputs) combined_outputs = dict(processed_outputs, **video_outputs)
...@@ -848,7 +879,12 @@ class Gemma4MultimodalEmbedder(nn.Module): ...@@ -848,7 +879,12 @@ class Gemma4MultimodalEmbedder(nn.Module):
info=Gemma4ProcessingInfo, info=Gemma4ProcessingInfo,
dummy_inputs=Gemma4DummyInputsBuilder, dummy_inputs=Gemma4DummyInputsBuilder,
) )
class Gemma4ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class Gemma4ForConditionalGeneration(
nn.Module,
SupportsMultiModal,
SupportsPP,
SupportsEagle3,
):
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment