Commit fc67613a authored by zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
...@@ -1577,6 +1577,22 @@ class VllmConfig: ...@@ -1577,6 +1577,22 @@ class VllmConfig:
compile_range_end, compile_range_end,
) )
if compilation_config.pass_config.fuse_minimax_qk_norm:
from vllm.compilation.passes.fusion.minimax_qk_norm_fusion import (
MAX_TOKEN_NUM,
)
max_token_num = min(
MAX_TOKEN_NUM, self.scheduler_config.max_num_batched_tokens
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_endpoints.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below MiniMax QK norm fusion threshold, "
"MiniMax QK norm fusion enabled for all num_tokens."
)
if compilation_config.compile_ranges_endpoints is not None: if compilation_config.compile_ranges_endpoints is not None:
for x in compilation_config.compile_ranges_endpoints: for x in compilation_config.compile_ranges_endpoints:
assert isinstance(x, int) assert isinstance(x, int)
......
...@@ -170,7 +170,8 @@ class AnthropicServingMessages(OpenAIServingChat): ...@@ -170,7 +170,8 @@ class AnthropicServingMessages(OpenAIServingChat):
else: else:
cls._convert_message_content(msg, openai_msg, openai_messages) cls._convert_message_content(msg, openai_msg, openai_messages)
openai_messages.append(openai_msg) if not (msg.role == "user" and "content" not in openai_msg):
openai_messages.append(openai_msg)
@classmethod @classmethod
def _convert_message_content( def _convert_message_content(
......
...@@ -372,6 +372,7 @@ async def init_app_state( ...@@ -372,6 +372,7 @@ async def init_app_state(
enable_auto_tools=args.enable_auto_tool_choice, enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser, tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs, default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack, log_error_stack=args.log_error_stack,
) )
...@@ -467,6 +468,7 @@ async def init_render_app_state( ...@@ -467,6 +468,7 @@ async def init_render_app_state(
enable_auto_tools=args.enable_auto_tool_choice, enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser, tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs, default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack, log_error_stack=args.log_error_stack,
) )
......
...@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None, default_template_kwargs=None,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=self.parser.tool_parser_cls if self.parser else None, tool_parser=self.parser.tool_parser_cls if self.parser else None,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
) )
return messages, engine_inputs return messages, engine_inputs
...@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None, default_template_kwargs=None,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=tool_parser, tool_parser=tool_parser,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
) )
return engine_inputs return engine_inputs
......
...@@ -44,6 +44,7 @@ from vllm.inputs import ( ...@@ -44,6 +44,7 @@ from vllm.inputs import (
) )
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.parser import ParserManager from vllm.parser import ParserManager
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.renderers import BaseRenderer, merge_kwargs from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs.preprocess import ( from vllm.renderers.inputs.preprocess import (
extract_prompt_components, extract_prompt_components,
...@@ -74,6 +75,7 @@ class OpenAIServingRender: ...@@ -74,6 +75,7 @@ class OpenAIServingRender:
enable_auto_tools: bool = False, enable_auto_tools: bool = False,
exclude_tools_when_tool_choice_none: bool = False, exclude_tools_when_tool_choice_none: bool = False,
tool_parser: str | None = None, tool_parser: str | None = None,
reasoning_parser: str | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None, default_chat_template_kwargs: dict[str, Any] | None = None,
log_error_stack: bool = False, log_error_stack: bool = False,
) -> None: ) -> None:
...@@ -94,6 +96,11 @@ class OpenAIServingRender: ...@@ -94,6 +96,11 @@ class OpenAIServingRender:
enable_auto_tools=enable_auto_tools, enable_auto_tools=enable_auto_tools,
model_name=model_config.model, model_name=model_config.model,
) )
self.reasoning_parser: type[ReasoningParser] | None = (
ParserManager.get_reasoning_parser(
reasoning_parser_name=reasoning_parser,
)
)
self.default_chat_template_kwargs: dict[str, Any] = ( self.default_chat_template_kwargs: dict[str, Any] = (
default_chat_template_kwargs or {} default_chat_template_kwargs or {}
) )
...@@ -245,6 +252,7 @@ class OpenAIServingRender: ...@@ -245,6 +252,7 @@ class OpenAIServingRender:
default_template_kwargs=self.default_chat_template_kwargs, default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts, tool_dicts=tool_dicts,
tool_parser=tool_parser, tool_parser=tool_parser,
reasoning_parser=self.reasoning_parser,
) )
else: else:
# For GPT-OSS. # For GPT-OSS.
...@@ -498,6 +506,9 @@ class OpenAIServingRender: ...@@ -498,6 +506,9 @@ class OpenAIServingRender:
default_template_kwargs: dict[str, Any] | None, default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None, tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: type[ToolParser] | None = None, tool_parser: type[ToolParser] | None = None,
reasoning_parser: type[ReasoningParser] | None = None,
*,
skip_mm_cache: bool = False,
) -> tuple[list[ConversationMessage], list[EngineInput]]: ) -> tuple[list[ConversationMessage], list[EngineInput]]:
"""Copied from OpenAIServing._preprocess_chat.""" """Copied from OpenAIServing._preprocess_chat."""
renderer = self.renderer renderer = self.renderer
...@@ -531,6 +542,10 @@ class OpenAIServingRender: ...@@ -531,6 +542,10 @@ class OpenAIServingRender:
}, },
) )
if reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request = reasoning_parser(tokenizer).adjust_request(request=request)
# tool parsing is done only if a tool_parser has been set and if # tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser # tool_choice is not "none" (if tool_choice is "none" but a tool_parser
# is set, we want to prevent parsing a tool_call hallucinated by the LLM # is set, we want to prevent parsing a tool_call hallucinated by the LLM
......
This diff is collapsed.
...@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader): ...@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight') GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found or None if no mapping found
""" """
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py # Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix. # tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as # Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it. # gguf-py expects it.
if hf_name.startswith("language_model."): if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.' hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix # Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")): if hf_name.endswith((".weight", ".bias")):
......
This diff is collapsed.
This diff is collapsed.
...@@ -113,7 +113,29 @@ class KimiK25ProcessingInfo(BaseProcessingInfo): ...@@ -113,7 +113,29 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
trust_remote_code=self.ctx.model_config.trust_remote_code, trust_remote_code=self.ctx.model_config.trust_remote_code,
) )
self.media_token_id = media_token_id = hf_config.media_placeholder_token_id # Resolve token ID from the tokenizer because transformers v5
# may remap token IDs vs config.json.
config_token_id = hf_config.media_placeholder_token_id
resolved_token_id = tokenizer.convert_tokens_to_ids("<|media_pad|>")
is_valid_resolved = isinstance(resolved_token_id, int) and (
tokenizer.unk_token_id is None
or resolved_token_id != tokenizer.unk_token_id
)
if is_valid_resolved and resolved_token_id != config_token_id:
logger.warning_once(
"Kimi-K2.5 config.media_placeholder_token_id (%d) disagrees "
"with tokenizer mapping for <|media_pad|> (%d). "
"Using tokenizer value.",
config_token_id,
resolved_token_id,
)
media_token_id = resolved_token_id
# Patch config so downstream code also sees the correct ID.
hf_config.media_placeholder_token_id = resolved_token_id
else:
media_token_id = config_token_id
self.media_token_id = media_token_id
self.media_token = tokenizer.decode(media_token_id) self.media_token = tokenizer.decode(media_token_id)
self.image_processor = image_processor self.image_processor = image_processor
...@@ -232,8 +254,7 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo]) ...@@ -232,8 +254,7 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargsItems, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() media_token_id = self.info.media_token_id
media_token_id = hf_config.media_placeholder_token_id
def get_replacement(item_idx: int): def get_replacement(item_idx: int):
media = mm_items.get_items("vision_chunk", (VisionChunkProcessorItems,)) media = mm_items.get_items("vision_chunk", (VisionChunkProcessorItems,))
......
...@@ -232,9 +232,7 @@ class MiniMaxM2Attention(nn.Module): ...@@ -232,9 +232,7 @@ class MiniMaxM2Attention(nn.Module):
) -> torch.Tensor: ) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states) qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = MiniMaxText01RMSNormTP.forward_qk( q, k = MiniMaxText01RMSNormTP.forward_qk(self.q_norm, self.k_norm, q, k)
self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
)
q, k = self.rotary_emb(positions, q, k) q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v) attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output) output, _ = self.o_proj(attn_output)
......
...@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import ( ...@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargsItems, MultiModalKwargsItems,
) )
......
...@@ -470,6 +470,15 @@ class DelegatingParser(Parser): ...@@ -470,6 +470,15 @@ class DelegatingParser(Parser):
# No tool calls # No tool calls
return [], content return [], content
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """Let the wrapped parsers adjust ``request`` before it is used.

    The reasoning parser (if one is set) is applied first, then the tool
    parser (if one is set). Each receives the request returned by the
    previous step.

    Args:
        request: The incoming chat-completion or responses request.

    Returns:
        The request after every configured parser has adjusted it; the
        input request itself when neither parser is set.
    """
    adjusted = request

    reasoning = self._reasoning_parser
    if reasoning is not None:
        adjusted = reasoning.adjust_request(adjusted)

    # Looked up only after the reasoning step, matching the original order.
    tools = self._tool_parser
    if tools is not None:
        adjusted = tools.adjust_request(adjusted)

    return adjusted
def extract_reasoning_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
......
...@@ -6,7 +6,7 @@ import os ...@@ -6,7 +6,7 @@ import os
from abc import abstractmethod from abc import abstractmethod
from collections.abc import Callable, Iterable, Sequence from collections.abc import Callable, Iterable, Sequence
from functools import cached_property from functools import cached_property
from typing import TYPE_CHECKING from typing import TYPE_CHECKING, cast
from vllm.entrypoints.mcp.tool_server import ToolServer from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -150,6 +150,12 @@ class ReasoningParser: ...@@ -150,6 +150,12 @@ class ReasoningParser:
previously been parsed and extracted (see constructor) previously been parsed and extracted (see constructor)
""" """
def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Hook for adjusting request parameters.

    The base implementation does nothing and hands ``request`` back
    unchanged; subclasses override this as needed.

    Args:
        request: The incoming chat-completion or responses request.

    Returns:
        The same ``request`` object that was passed in.
    """
    return request
def prepare_structured_tag( def prepare_structured_tag(
self, self,
original_tag: str | None, original_tag: str | None,
...@@ -298,7 +304,7 @@ class ReasoningParserManager: ...@@ -298,7 +304,7 @@ class ReasoningParserManager:
if isinstance(name, str): if isinstance(name, str):
names = [name] names = [name]
elif is_list_of(name, str): elif is_list_of(name, str):
names = name names = cast(list[str], name)
else: else:
names = [class_name] names = [class_name]
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment