Commit fc67613a authored by zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
......@@ -1577,6 +1577,22 @@ class VllmConfig:
compile_range_end,
)
if compilation_config.pass_config.fuse_minimax_qk_norm:
from vllm.compilation.passes.fusion.minimax_qk_norm_fusion import (
MAX_TOKEN_NUM,
)
max_token_num = min(
MAX_TOKEN_NUM, self.scheduler_config.max_num_batched_tokens
)
if compile_range_end is not None and max_token_num < compile_range_end:
computed_compile_ranges_endpoints.append(max_token_num)
else:
logger.debug(
"Max num batched tokens below MiniMax QK norm fusion threshold, "
"MiniMax QK norm fusion enabled for all num_tokens."
)
if compilation_config.compile_ranges_endpoints is not None:
for x in compilation_config.compile_ranges_endpoints:
assert isinstance(x, int)
......
......@@ -170,7 +170,8 @@ class AnthropicServingMessages(OpenAIServingChat):
else:
cls._convert_message_content(msg, openai_msg, openai_messages)
openai_messages.append(openai_msg)
if not (msg.role == "user" and "content" not in openai_msg):
openai_messages.append(openai_msg)
@classmethod
def _convert_message_content(
......
......@@ -372,6 +372,7 @@ async def init_app_state(
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)
......@@ -467,6 +468,7 @@ async def init_render_app_state(
enable_auto_tools=args.enable_auto_tool_choice,
exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
tool_parser=args.tool_call_parser,
reasoning_parser=args.structured_outputs_config.reasoning_parser,
default_chat_template_kwargs=args.default_chat_template_kwargs,
log_error_stack=args.log_error_stack,
)
......
......@@ -594,6 +594,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None,
tool_dicts=tool_dicts,
tool_parser=self.parser.tool_parser_cls if self.parser else None,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
)
return messages, engine_inputs
......@@ -618,6 +619,7 @@ class OpenAIServingResponses(OpenAIServing):
default_template_kwargs=None,
tool_dicts=tool_dicts,
tool_parser=tool_parser,
reasoning_parser=self.parser.reasoning_parser_cls if self.parser else None,
)
return engine_inputs
......
......@@ -44,6 +44,7 @@ from vllm.inputs import (
)
from vllm.logger import init_logger
from vllm.parser import ParserManager
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs.preprocess import (
extract_prompt_components,
......@@ -74,6 +75,7 @@ class OpenAIServingRender:
enable_auto_tools: bool = False,
exclude_tools_when_tool_choice_none: bool = False,
tool_parser: str | None = None,
reasoning_parser: str | None = None,
default_chat_template_kwargs: dict[str, Any] | None = None,
log_error_stack: bool = False,
) -> None:
......@@ -94,6 +96,11 @@ class OpenAIServingRender:
enable_auto_tools=enable_auto_tools,
model_name=model_config.model,
)
self.reasoning_parser: type[ReasoningParser] | None = (
ParserManager.get_reasoning_parser(
reasoning_parser_name=reasoning_parser,
)
)
self.default_chat_template_kwargs: dict[str, Any] = (
default_chat_template_kwargs or {}
)
......@@ -245,6 +252,7 @@ class OpenAIServingRender:
default_template_kwargs=self.default_chat_template_kwargs,
tool_dicts=tool_dicts,
tool_parser=tool_parser,
reasoning_parser=self.reasoning_parser,
)
else:
# For GPT-OSS.
......@@ -498,6 +506,9 @@ class OpenAIServingRender:
default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: type[ToolParser] | None = None,
reasoning_parser: type[ReasoningParser] | None = None,
*,
skip_mm_cache: bool = False,
) -> tuple[list[ConversationMessage], list[EngineInput]]:
"""Copied from OpenAIServing._preprocess_chat."""
renderer = self.renderer
......@@ -531,6 +542,10 @@ class OpenAIServingRender:
},
)
if reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request = reasoning_parser(tokenizer).adjust_request(request=request)
# tool parsing is done only if a tool_parser has been set and if
# tool_choice is not "none" (if tool_choice is "none" but a tool_parser
# is set, we want to prevent parsing a tool_call hallucinated by the LLM
......
This diff is collapsed.
......@@ -209,12 +209,24 @@ class GGUFModelLoader(BaseModelLoader):
GGUF tensor name with suffix (e.g., 'mm.soft_emb_norm.weight')
or None if no mapping found
"""
# In transformers v5, multimodal models (e.g. Gemma3) wrap
# all sub-models under an outer 'model.' attribute, producing
# state_dict keys like 'model.language_model.layers.0...' and
# 'model.vision_tower.vision_model...'. Strip this outer
# prefix so the keys match what gguf-py expects.
if is_multimodal and hf_name.startswith("model."):
hf_name = hf_name[6:] # Remove outer 'model.'
# Strip 'language_model.' prefix for multimodal models - gguf-py
# tensor mappings expect parameter names without this prefix.
# Note: 'model.' prefix should be KEPT for text-only models as
# gguf-py expects it.
if hf_name.startswith("language_model."):
hf_name = hf_name[15:] # Remove 'language_model.'
# Re-add 'model.' prefix because gguf-py text tensor maps
# expect 'model.layers...' format.
if is_multimodal:
hf_name = "model." + hf_name
# Parse parameter name and suffix
if hf_name.endswith((".weight", ".bias")):
......
This diff is collapsed.
This diff is collapsed.
......@@ -113,7 +113,29 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
trust_remote_code=self.ctx.model_config.trust_remote_code,
)
self.media_token_id = media_token_id = hf_config.media_placeholder_token_id
# Resolve token ID from the tokenizer because transformers v5
# may remap token IDs vs config.json.
config_token_id = hf_config.media_placeholder_token_id
resolved_token_id = tokenizer.convert_tokens_to_ids("<|media_pad|>")
is_valid_resolved = isinstance(resolved_token_id, int) and (
tokenizer.unk_token_id is None
or resolved_token_id != tokenizer.unk_token_id
)
if is_valid_resolved and resolved_token_id != config_token_id:
logger.warning_once(
"Kimi-K2.5 config.media_placeholder_token_id (%d) disagrees "
"with tokenizer mapping for <|media_pad|> (%d). "
"Using tokenizer value.",
config_token_id,
resolved_token_id,
)
media_token_id = resolved_token_id
# Patch config so downstream code also sees the correct ID.
hf_config.media_placeholder_token_id = resolved_token_id
else:
media_token_id = config_token_id
self.media_token_id = media_token_id
self.media_token = tokenizer.decode(media_token_id)
self.image_processor = image_processor
......@@ -232,8 +254,7 @@ class KimiK25MultiModalProcessor(BaseMultiModalProcessor[KimiK25ProcessingInfo])
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
media_token_id = hf_config.media_placeholder_token_id
media_token_id = self.info.media_token_id
def get_replacement(item_idx: int):
media = mm_items.get_items("vision_chunk", (VisionChunkProcessorItems,))
......
......@@ -232,9 +232,7 @@ class MiniMaxM2Attention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
q, k = MiniMaxText01RMSNormTP.forward_qk(
self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
)
q, k = MiniMaxText01RMSNormTP.forward_qk(self.q_norm, self.k_norm, q, k)
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
......
......@@ -32,9 +32,9 @@ from transformers.models.musicflamingo import (
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs import MultiModalDataDict
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (
MultiModalDataDict,
MultiModalFieldConfig,
MultiModalKwargsItems,
)
......
......@@ -470,6 +470,15 @@ class DelegatingParser(Parser):
# No tool calls
return [], content
def adjust_request(
    self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
    """Run the request through each configured delegate parser.

    The reasoning parser is consulted first and the tool parser second;
    a delegate that is ``None`` is skipped. Each delegate's
    ``adjust_request`` result replaces the request handed to the next.

    Args:
        request: The incoming chat-completion or responses request.

    Returns:
        The request as returned by the last delegate that was applied,
        or the original request when no delegate is set.
    """
    for delegate in (self._reasoning_parser, self._tool_parser):
        if delegate is not None:
            request = delegate.adjust_request(request)
    return request
def extract_reasoning_streaming(
self,
previous_text: str,
......
......@@ -6,7 +6,7 @@ import os
from abc import abstractmethod
from collections.abc import Callable, Iterable, Sequence
from functools import cached_property
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, cast
from vllm.entrypoints.mcp.tool_server import ToolServer
from vllm.logger import init_logger
......@@ -150,6 +150,12 @@ class ReasoningParser:
previously been parsed and extracted (see constructor)
"""
def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Hook for tweaking a request before it is processed.

    This base implementation hands the request back untouched;
    subclasses may override it to modify request parameters.

    Args:
        request: The incoming chat-completion or responses request.

    Returns:
        The same request object that was passed in.
    """
    return request
def prepare_structured_tag(
self,
original_tag: str | None,
......@@ -298,7 +304,7 @@ class ReasoningParserManager:
if isinstance(name, str):
names = [name]
elif is_list_of(name, str):
names = name
names = cast(list[str], name)
else:
names = [class_name]
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment