Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

8d75f22e · zhuwenwen · ce888aa4 · 7d80c73d · 8d75f22e · 8d75f22e
Commit 8d75f22e authored Dec 13, 2025 by zhuwenwen
20 changed files
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -44,6 +44,10 @@ _REASONING_PARSERS_TO_REGISTER = {
        "granite_reasoning_parser",
        "GraniteReasoningParser",
    ),
+    "holo2": (
+        "holo2_reasoning_parser",
+        "Holo2ReasoningParser",
+    ),
    "hunyuan_a13b": (
        "hunyuan_a13b_reasoning_parser",
        "HunyuanA13BReasoningParser",

--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -63,6 +63,31 @@ class ReasoningParser:
            True if the reasoning content ends in the input_ids.
        """

+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        """
+        Check if the reasoning content ends during the current decode
+        step. By default this defers to `is_reasoning_end(input_ids)`.
+
+        It is used in structured engines like `xgrammar` to check if the
+        reasoning content ends in the model output during a decode step.
+        `input_ids` is the entire model output and `delta_ids` are the last
+        few computed tokens of the model output (like during a decode step).
+
+        Parameters:
+        input_ids: list[int]
+            The entire model output.
+        delta_ids: list[int]
+            The tokens computed at the current decode step (unused here).
+
+        Returns:
+        bool
+            Whatever `is_reasoning_end(input_ids)` returns; subclasses may
+            override this method to inspect only `delta_ids`.
+        """
+        return self.is_reasoning_end(input_ids)
+
    @abstractmethod
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """
@@ -121,7 +146,7 @@ class ReasoningParser:
        self,
        original_tag: str | None,
        tool_server: ToolServer | None,
-    ) -> str:
+    ) -> str | None:
        """
        Instance method that is implemented for preparing the structured tag
        Otherwise, None is returned
@@ -160,7 +185,10 @@ class ReasoningParserManager:
        if name in cls.lazy_parsers:
            return cls._load_lazy_parser(name)

-        raise KeyError(f"Reasoning parser '{name}' not found.")
+        registered = ", ".join(cls.list_registered())
+        raise KeyError(
+            f"Reasoning parser '{name}' not found. Available parsers: {registered}"
+        )

    @classmethod
    def list_registered(cls) -> list[str]:

--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -64,8 +64,21 @@ class BaseThinkingReasoningParser(ReasoningParser):
            )

    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        start_token_id = self.start_token_id
        end_token_id = self.end_token_id
-        return any(input_id == end_token_id for input_id in reversed(input_ids))
+
+        for i in range(len(input_ids) - 1, -1, -1):
+            if input_ids[i] == start_token_id:
+                return False
+            if input_ids[i] == end_token_id:
+                return True
+        return False
+
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        end_token_id = self.end_token_id
+        return end_token_id in delta_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """

--- a/vllm/reasoning/deepseek_v3_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self._parser.is_reasoning_end(input_ids)

+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
+
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return self._parser.extract_content_ids(input_ids)


--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -5,7 +5,7 @@ from collections.abc import Sequence

 from transformers import PreTrainedTokenizerBase

-from vllm.entrypoints.harmony_utils import parse_chat_output
+from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
@@ -145,7 +145,7 @@ class GptOssReasoningParser(ReasoningParser):
    # This function prepares the structural tag to format reasoning output
    def prepare_structured_tag(
        self, original_tag: str | None, tool_server: ToolServer | None
-    ) -> str:
+    ) -> str | None:
        if original_tag is None:
            if tool_server is None:
                return json.dumps(no_func_reaonsing_tag)

--- a/vllm/reasoning/holo2_reasoning_parser.py
+++ b/vllm/reasoning/holo2_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import (
+    ReasoningParser,
+)
+from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
+from vllm.tokenizers import TokenizerLike
+
+logger = init_logger(__name__)
+
+
+class Holo2ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for Holo2 models (which are based on Qwen3).
+
+    Holo2 wraps its reasoning in <think>...</think>, but the opening <think>
+    is emitted by the chat template rather than by the model. The model output
+    therefore only contains the reasoning text followed by </think>.
+
+    Reasoning can be switched off through the chat template argument
+    `thinking` (default: True).
+
+    Parsing rules on model output:
+        - thinking == False
+            -> Model output is treated as purely the content |content|
+        - thinking == True
+            -> Model output is |reasoning_content|</think>|content|
+
+    All parsing work is forwarded to an inner parser selected at construction
+    time.
+    """
+
+    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+
+        # `chat_template_kwargs` may be absent or explicitly None.
+        template_kwargs = kwargs.get("chat_template_kwargs", {}) or {}
+
+        # Holo2 resembles DeepSeek V3 but thinks by default. The structured
+        # output manager builds this parser once, without user supplied chat
+        # template args, and shares it across requests, so a missing
+        # `thinking` key has to resolve to True.
+        thinking_enabled = bool(template_kwargs.get("thinking", True))
+
+        delegate_cls = (
+            DeepSeekR1ReasoningParser if thinking_enabled else IdentityReasoningParser
+        )
+        self._parser = delegate_cls(tokenizer, *args, **kwargs)
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        # Forwarded unchanged to the inner parser.
+        inner = self._parser
+        return inner.is_reasoning_end(input_ids)
+
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        # Forwarded unchanged to the inner parser.
+        inner = self._parser
+        return inner.is_reasoning_end_streaming(input_ids, delta_ids)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        # Forwarded unchanged to the inner parser.
+        inner = self._parser
+        return inner.extract_content_ids(input_ids)
+
+    def extract_reasoning(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        # Forwarded unchanged to the inner parser.
+        inner = self._parser
+        return inner.extract_reasoning(model_output, request)
+
+    def extract_reasoning_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        # Arguments are passed positionally, in the same order as received.
+        streaming_args = (
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )
+        return self._parser.extract_reasoning_streaming(*streaming_args)
--- a/vllm/reasoning/identity_reasoning_parser.py
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
        # Always return True, since we never treat reasoning specially
        return True

+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return True
+
    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        # Identity: return all tokens as content
        return input_ids

--- a/vllm/tokenizers/__init__.py
+++ b/vllm/tokenizers/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from .deepseekv32 import DeepseekV32Tokenizer
 from .hf import HfTokenizer
 from .mistral import MistralTokenizer
 from .protocol import TokenizerLike
@@ -21,4 +22,5 @@ __all__ = [
    "get_tokenizer",
    "cached_tokenizer_from_config",
    "init_tokenizer_from_config",
+    "DeepseekV32Tokenizer",
 ]
--- a/vllm/tokenizers/deepseek_v32_encoding.py
+++ b/vllm/tokenizers/deepseek_v32_encoding.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
+import copy
+import json
+from typing import Any
+
+import regex as re
+
+# flake8: noqa: E501
+# System-prompt section that advertises the available tools to the model.
+# Filled in with `str.format`; the placeholders are {dsml_token},
+# {thinking_start_token}, {thinking_end_token} and {tool_schemas}.
+TOOLS_SYSTEM_TEMPLATE = """## Tools
+You have access to a set of tools you can use to answer the user's question.
+You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
+<{dsml_token}function_calls>
+<{dsml_token}invoke name="$FUNCTION_NAME">
+<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
+...
+</{dsml_token}invoke>
+<{dsml_token}invoke name="$FUNCTION_NAME2">
+...
+</{dsml_token}invoke>
+</{dsml_token}function_calls>
+String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
+If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
+<{dsml_token}function_calls>
+...
+</{dsml_token}function_calls>
+<function_results>
+...
+</function_results>
+{thinking_start_token}...thinking about results{thinking_end_token}
+Here are the functions available in JSONSchema format:
+<functions>
+{tool_schemas}
+</functions>
+"""
+
+# Special tokens used when rendering and parsing conversations.
+bos_token: str = "<｜begin▁of▁sentence｜>"
+eos_token: str = "<｜end▁of▁sentence｜>"
+thinking_start_token: str = "<think>"
+thinking_end_token: str = "</think>"
+dsml_token: str = "｜DSML｜"
+
+# Per-role `str.format` templates.
+system_msg_template: str = "{content}"
+user_msg_template: str = "<｜User｜>{content}<｜Assistant｜>"
+assistant_msg_template: str = "{reasoning}{content}{tool_calls}<｜end▁of▁sentence｜>"
+thinking_template: str = "{reasoning_content}"
+
+# Appended to system/developer messages that carry a `response_format`.
+response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
+# A single tool invocation inside a function_calls block.
+tool_call_template: str = (
+    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
+)
+# The function_calls block wrapping all invocations of one assistant turn.
+tool_calls_template: str = (
+    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
+)
+
+# One tool result inside a <function_results> block.
+tool_output_template: str = "\n<result>{content}</result>"
+
+
+def to_json(value: Any) -> str:
+    """Serialize `value` to JSON, keeping non-ASCII characters when possible.
+
+    If serialization with `ensure_ascii=False` raises, it is retried with
+    `ensure_ascii=True`.
+    """
+    try:
+        encoded = json.dumps(value, ensure_ascii=False)
+    except Exception:
+        encoded = json.dumps(value, ensure_ascii=True)
+    return encoded
+
+
+def tools_from_openai_format(tools):
+    """Return the `"function"` entry of every OpenAI-style tool definition."""
+    function_specs = []
+    for tool_entry in tools:
+        function_specs.append(tool_entry["function"])
+    return function_specs
+
+
+def tool_calls_from_openai_format(tool_calls):
+    """Flatten OpenAI-style tool calls into `{"name", "arguments"}` dicts."""
+    flattened = []
+    for call in tool_calls:
+        flattened.append(
+            {
+                "name": call["function"]["name"],
+                "arguments": call["function"]["arguments"],
+            }
+        )
+    return flattened
+
+
+def tool_calls_to_openai_format(tool_calls):
+    """Wrap flat `{"name", "arguments"}` dicts into OpenAI-style tool calls."""
+
+    def _wrap(call):
+        function_part = {"name": call["name"], "arguments": call["arguments"]}
+        return {"type": "function", "function": function_part}
+
+    return [_wrap(call) for call in tool_calls]
+
+
+def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
+    """Render the arguments of one tool call as DSML parameter tags.
+
+    `tool_call["arguments"]` may be a JSON string (decoded first) or an
+    already decoded mapping. String values are emitted verbatim with
+    `string="true"`; every other value is JSON encoded with `string="false"`.
+    The tags are joined with newlines.
+    """
+    raw_arguments = tool_call["arguments"]
+    if isinstance(raw_arguments, str):
+        arguments = json.loads(raw_arguments)
+    else:
+        arguments = raw_arguments
+
+    param_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
+
+    rendered_params = []
+    for param_name, param_value in arguments.items():
+        value_is_str = isinstance(param_value, str)
+        rendered_params.append(
+            param_template.format(
+                dsml_token=dsml_token,
+                key=param_name,
+                is_str="true" if value_is_str else "false",
+                value=param_value if value_is_str else to_json(param_value),
+            )
+        )
+
+    return "\n".join(rendered_params)
+
+
+def decode_dsml_to_arguments(
+    tool_name: str, tool_args: dict[str, tuple[str, str]]
+) -> dict[str, str]:
+    """Build a `{"name", "arguments"}` tool call from parsed DSML parameters.
+
+    `tool_args` maps each parameter name to `(raw_value, string_flag)`.
+    Values flagged `"true"` are JSON encoded as strings; all other values are
+    inserted verbatim. The result's `"arguments"` is the assembled JSON
+    object text.
+    """
+    members = []
+    for param_name, (raw_value, string_flag) in tool_args.items():
+        rendered_value = to_json(raw_value) if string_flag == "true" else raw_value
+        members.append(f"{to_json(param_name)}: {rendered_value}")
+
+    arguments_json = "{" + ", ".join(members) + "}"
+    return {"name": tool_name, "arguments": arguments_json}
+
+
+def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
+    """Fill the tools system template with one JSON schema per line."""
+    schema_block = "\n".join(to_json(tool) for tool in tools)
+    template_fields = {
+        "tool_schemas": schema_block,
+        "dsml_token": dsml_token,
+        "thinking_start_token": thinking_start_token,
+        "thinking_end_token": thinking_end_token,
+    }
+    return TOOLS_SYSTEM_TEMPLATE.format(**template_fields)
+
+
+def find_last_user_index(messages: list[dict[str, Any]]) -> int:
+    """Return the index of the last "user" or "developer" message, or -1."""
+    for position in reversed(range(len(messages))):
+        if messages[position].get("role") in ["user", "developer"]:
+            return position
+    return -1
+
+
+def render_message(
+    index: int, messages: list[dict[str, Any]], thinking_mode: str
+) -> str:
+    """Render the message at `index` of `messages` into prompt text.
+
+    Args:
+        index: Position of the message to render; must be a valid index.
+        messages: The full conversation. Other messages are consulted to
+            find the last user/developer turn and, for tool results, the
+            assistant message that issued the calls.
+        thinking_mode: Either "chat" or "thinking".
+
+    Returns:
+        The prompt fragment for this single message.
+
+    Raises:
+        AssertionError: On an out-of-range index, an unknown thinking mode,
+            or a conversation that violates one of the inline asserts.
+        NotImplementedError: If the role is not one of system, developer,
+            user, tool or assistant.
+    """
+    assert 0 <= index < len(messages)
+    assert thinking_mode in ["chat", "thinking"], (
+        f"Invalid thinking_mode `{thinking_mode}`"
+    )
+
+    prompt = ""
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get("role")
+    content = msg.get("content")
+    tools = msg.get("tools")
+    response_format = msg.get("response_format")
+    tool_calls = msg.get("tool_calls")
+    # Either key may carry the reasoning text; "reasoning" wins when truthy.
+    reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
+
+    # Normalize OpenAI-style structures into the flat internal form.
+    if tools:
+        tools = tools_from_openai_format(tools)
+    if tool_calls:
+        tool_calls = tool_calls_from_openai_format(tool_calls)
+
+    if role == "system":
+        prompt += system_msg_template.format(content=content or "")
+        if tools:
+            prompt += "\n\n" + render_tools(tools)
+
+        if response_format:
+            prompt += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+    elif role == "developer":
+        # Rendered through the user template, with tools / response format
+        # placed ahead of the actual message text.
+        assert content, f"Invalid message for role `{role}`: {msg}"
+        content_developer = ""
+        if tools:
+            content_developer += "\n\n" + render_tools(tools)
+
+        if response_format:
+            content_developer += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+        content_developer += "\n\n# The user's message is: {}".format(content)
+
+        prompt += user_msg_template.format(content=content_developer)
+        # Only the last user/developer turn opens a thinking block; every
+        # other turn is followed by the thinking end token.
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "user":
+        prompt += user_msg_template.format(content=content)
+
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "tool":
+        # Walk backwards over preceding tool messages to find the message
+        # that issued the calls.
+        # NOTE(review): when the walk reaches -1, `messages[-1]` wraps to the
+        # last message. For index > 0 the assert below rejects that case, but
+        # for index == 0 the assert passes and the last message is used.
+        prev_assistant_idx = index - 1
+        assistant_msg = messages[prev_assistant_idx]
+        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
+            prev_assistant_idx -= 1
+            assistant_msg = messages[prev_assistant_idx]
+
+        assert (
+            index == 0
+            or prev_assistant_idx >= 0
+            and assistant_msg.get("role") == "assistant"
+        ), f"Invalid messages at {index}:\n{assistant_msg}"
+
+        # 1-based position of this tool result among the results that follow
+        # the located message.
+        tool_call_order = index - prev_assistant_idx
+        assistant_tool_calls = assistant_msg.get("tool_calls")
+        assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, (
+            "No tool calls but found tool output"
+        )
+
+        # The first result opens the results block ...
+        if tool_call_order == 1:
+            prompt += "\n\n<function_results>"
+
+        prompt += tool_output_template.format(content=content)
+
+        # ... and the result matching the final tool call closes it.
+        if tool_call_order == len(assistant_tool_calls):
+            prompt += "\n</function_results>"
+
+            if index >= last_user_idx and thinking_mode == "thinking":
+                prompt += "\n\n" + thinking_start_token
+            else:
+                prompt += "\n\n" + thinking_end_token
+
+    elif role == "assistant":
+        # NOTE(review): `prev_assistant_idx` is assigned but not read in this
+        # branch.
+        prev_assistant_idx = index
+        thinking_part = ""
+
+        tool_calls_content = ""
+        if tool_calls:
+            tool_calls = [
+                tool_call_template.format(
+                    dsml_token=dsml_token,
+                    name=tool_call.get("name"),
+                    arguments=encode_arguments_to_dsml(tool_call),
+                )
+                for tool_call in tool_calls
+            ]
+            tool_calls_content += "\n\n" + tool_calls_template.format(
+                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
+            )
+
+        summary_content = content or ""
+
+        # Reasoning is only rendered for assistant turns after the last
+        # user/developer message, and only in thinking mode.
+        if thinking_mode == "thinking" and index > last_user_idx:
+            assert reasoning_content or tool_calls, (
+                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
+            )
+            thinking_part = (
+                thinking_template.format(reasoning_content=reasoning_content or "")
+                + thinking_end_token
+            )
+
+        prompt += assistant_msg_template.format(
+            reasoning=thinking_part,
+            content=summary_content,
+            tool_calls=tool_calls_content,
+        )
+    else:
+        raise NotImplementedError(f"Unknown role: {role}")
+
+    return prompt
+
+
+def drop_thinking_messages(
+    messages: list[dict[str, Any]], last_user_idx: int | None = None
+) -> list[dict[str, Any]]:
+    """Strip reasoning from assistant turns before the last user message.
+
+    The returned list always has the same length and order as `messages`, so
+    indices computed against the original conversation remain valid for it.
+    Previously, messages before the last user turn whose role was not one of
+    user/system/tool/assistant (for example "developer") were dropped, which
+    shifted every later index.
+
+    Args:
+        messages: The conversation; it is not mutated.
+        last_user_idx: Index of the last user/developer message. When None it
+            is computed with `find_last_user_index`.
+
+    Returns:
+        A new list. Assistant messages located before `last_user_idx` are
+        shallow copies without the "reasoning_content" and "reasoning" keys;
+        every other entry is the original message object.
+    """
+    if last_user_idx is None:
+        last_user_idx = find_last_user_index(messages)
+
+    messages_wo_thinking: list[dict[str, Any]] = []
+    for idx, msg in enumerate(messages):
+        if idx < last_user_idx and msg.get("role") == "assistant":
+            # Copy before popping so the caller's message stays untouched.
+            msg = copy.copy(msg)
+            msg.pop("reasoning_content", None)
+            msg.pop("reasoning", None)
+        messages_wo_thinking.append(msg)
+
+    return messages_wo_thinking
+
+
+def encode_messages(
+    messages: list[dict[str, Any]],
+    thinking_mode: str,
+    context: list[dict[str, Any]] | None = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+) -> str:
+    """Render `messages` into prompt text.
+
+    `context` holds earlier messages that are consulted while rendering but
+    are not rendered themselves. The BOS token is only prepended when
+    `add_default_bos_token` is set and there is no context. In "thinking"
+    mode with `drop_thinking` enabled, the conversation is first passed
+    through `drop_thinking_messages`.
+    """
+    context = context if context else []
+    context_len = len(context)
+    full_messages = context + messages
+
+    if thinking_mode == "thinking" and drop_thinking:
+        full_messages = drop_thinking_messages(full_messages)
+
+    pieces = [bos_token if add_default_bos_token and context_len == 0 else ""]
+    for offset in range(len(messages)):
+        pieces.append(
+            render_message(
+                context_len + offset, full_messages, thinking_mode=thinking_mode
+            )
+        )
+
+    return "".join(pieces)
+
+
+def _read_until_stop(
+    index: int, text: str, stop: list[str]
+) -> tuple[int, str, None | str]:
+    """Consume `text` from `index` up to the earliest occurrence of a stop.
+
+    Returns `(new_index, consumed_text, matched_stop)`. `new_index` points
+    just past the matched stop string. When no stop matches, the rest of the
+    text is consumed, `new_index` is `len(text)` and `matched_stop` is None.
+    If two stops are found at the same position, the one listed first wins.
+    """
+    earliest_pos = len(text)
+    earliest_stop = None
+
+    for candidate in stop:
+        found_at = text.find(candidate, index)
+        if 0 <= found_at < earliest_pos:
+            earliest_pos, earliest_stop = found_at, candidate
+
+    if not earliest_stop:
+        return len(text), text[index:], None
+
+    consumed = text[index:earliest_pos]
+    return earliest_pos + len(earliest_stop), consumed, earliest_stop
+
+
+def parse_tool_calls(index: int, text: str):
+    """Parse the invocations of a DSML function_calls block.
+
+    Parsing starts at `index`, which the asserts below expect to sit right
+    before the `>\\n` that closes the opening function_calls tag.
+
+    Returns:
+        `(index, stop_token, tool_calls)`: the position after the last
+        consumed stop token, that stop token, and one `{"name", "arguments"}`
+        dict per parsed invocation.
+
+    Raises:
+        AssertionError: If the text does not follow the expected format.
+    """
+    tool_calls: list[dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f"</{dsml_token}function_calls>"
+
+    while index < len(text):
+        # Advance to the next invocation or to the end of the block. `_`
+        # holds the text skipped on the way and is checked right below.
+        index, _, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
+        )
+        assert _ == ">\n", "Tool call format error"
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        assert stop_token is not None, "Missing special token"
+
+        # Text up to the first parameter (or the invoke end) holds the
+        # ` name="...">` remainder of the invoke tag.
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+        )
+
+        p_tool_name = re.findall(
+            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
+        )
+        assert len(p_tool_name) == 1, "Tool name format error"
+        tool_name = p_tool_name[0]
+
+        tool_args: dict[str, tuple[str, str]] = {}
+        while stop_token == f"<{dsml_token}parameter":
+            # The stop string omits the leading "<" of the closing tag, so
+            # the consumed text ends with "<" (matched by the regex below).
+            index, param_content, stop_token = _read_until_stop(
+                index, text, [f"/{dsml_token}parameter"]
+            )
+
+            param_kv = re.findall(
+                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
+                param_content,
+                flags=re.DOTALL,
+            )
+            assert len(param_kv) == 1, "Parameter format error"
+            param_name, string, param_value = param_kv[0]
+
+            assert param_name not in tool_args, "Duplicate parameter name"
+            tool_args[param_name] = (param_value, string)
+
+            # Only `>\n` may separate the closing parameter tag from the next
+            # parameter or from the invoke end.
+            index, content, stop_token = _read_until_stop(
+                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+            )
+            assert content == ">\n", "Parameter format error"
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+
+# NOTE: This function is designed to parse only correctly
+# formatted string and will not attempt to correct malformed output
+# that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str):
+    """Parse a raw completion into an assistant message dict.
+
+    Args:
+        text: The completion text. The asserts below require it to end with
+            the EOS token.
+        thinking_mode: "thinking" makes the parser expect reasoning text
+            terminated by the thinking end token at the start of `text`.
+
+    Returns:
+        A dict with keys "role", "content", "reasoning_content", "reasoning"
+        (same value as "reasoning_content") and "tool_calls" (OpenAI format).
+
+    Raises:
+        AssertionError: If the text does not follow the expected format.
+    """
+    summary_content, reasoning_content, tool_calls = "", "", []
+    index, stop_token = 0, None
+    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
+
+    is_thinking, is_tool_calling = thinking_mode == "thinking", False
+
+    if is_thinking:
+        # Reasoning must be closed by the thinking end token before any
+        # tool call block starts.
+        index, content_delta, stop_token = _read_until_stop(
+            index, text, [thinking_end_token, tool_calls_start_token]
+        )
+        reasoning_content = content_delta
+        assert stop_token == thinking_end_token, "Invalid thinking format"
+
+    # The visible answer runs until EOS or until a tool call block begins.
+    index, content_delta, stop_token = _read_until_stop(
+        index, text, [eos_token, tool_calls_start_token]
+    )
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        assert stop_token == eos_token, "Invalid summary format"
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        # Nothing may sit between the tool call block and EOS.
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        assert not tool_ends_text, "Unexpected content after tool calls"
+
+    assert len(text) == index and stop_token in [eos_token, None], (
+        "Unexpected content at end"
+    )
+
+    # Neither the answer nor the reasoning may contain a special token.
+    for sp_token in [
+        bos_token,
+        eos_token,
+        thinking_start_token,
+        thinking_end_token,
+        dsml_token,
+    ]:
+        assert sp_token not in summary_content and sp_token not in reasoning_content, (
+            "Unexpected special token in content"
+        )
+
+    return {
+        "role": "assistant",
+        "content": summary_content,
+        "reasoning_content": reasoning_content,
+        "reasoning": reasoning_content,
+        "tool_calls": tool_calls_to_openai_format(tool_calls),
+    }
--- a/vllm/tokenizers/deepseekv32.py
+++ b/vllm/tokenizers/deepseekv32.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+
+from transformers import BatchEncoding
+
+from .deepseek_v32_encoding import encode_messages
+from .hf import HfTokenizer, TokenizerLike
+from .registry import TokenizerRegistry
+
+
+@TokenizerRegistry.register("deepseek_v32")
+class DeepseekV32Tokenizer(HfTokenizer):
+    """Tokenizer wrapper that renders chats with the DeepSeek-V3.2 encoding.
+
+    Tokenization itself is delegated to the wrapped tokenizer; only
+    `apply_chat_template` is implemented here, on top of `encode_messages`.
+    """
+
+    def __init__(self, tokenizer: TokenizerLike):
+        self.tokenizer = tokenizer
+        # Fall back to an empty string when the wrapped tokenizer has no name.
+        self.name_or_path = getattr(tokenizer, "name_or_path", "")
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        path_or_repo_id: str | Path,
+        *args,
+        trust_remote_code: bool = False,
+        revision: str | None = None,
+        download_dir: str | None = None,
+        **kwargs,
+    ) -> "TokenizerLike":
+        """Load the underlying tokenizer via the parent class and wrap it."""
+        tokenizer = super().from_pretrained(
+            path_or_repo_id,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            download_dir=download_dir,
+            **kwargs,
+        )
+        # Use `cls` rather than the hard-coded class name so that subclasses
+        # get an instance of their own type.
+        return cls(tokenizer)
+
+    def apply_chat_template(self, messages, tools=None, **kwargs):
+        """Render a conversation to a prompt string.
+
+        Recognized kwargs:
+            thinking: truthy selects "thinking" mode, otherwise "chat".
+            conversation: used instead of `messages` when given.
+
+        When `tools` is non-empty, a system message carrying the tools is
+        inserted at the front of a copy of the conversation and reasoning of
+        earlier turns is kept (`drop_thinking=False`).
+        """
+        thinking_mode = "thinking" if kwargs.get("thinking", False) else "chat"
+        conversation = kwargs.get("conversation", messages)
+        # Copy the list so the caller's conversation is not modified by the
+        # insert below.
+        messages = conversation.copy()
+        drop_thinking = True
+        if tools is not None and len(tools) > 0:
+            messages.insert(0, {"role": "system", "tools": tools})
+            drop_thinking = False
+        return encode_messages(
+            messages, thinking_mode=thinking_mode, drop_thinking=drop_thinking
+        )
+
+    def num_special_tokens_to_add(self) -> int:
+        # Whatever encoding the empty string yields is what gets added.
+        return len(self.encode(""))
+
+    # The properties below forward to the wrapped tokenizer.
+    @property
+    def all_special_tokens(self) -> list[str]:
+        return self.tokenizer.all_special_tokens
+
+    @property
+    def all_special_ids(self) -> list[int]:
+        return self.tokenizer.all_special_ids
+
+    @property
+    def bos_token_id(self) -> int:
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_token_id(self) -> int:
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_token_id(self) -> int:
+        return self.tokenizer.pad_token_id
+
+    @property
+    def is_fast(self) -> bool:
+        return self.tokenizer.is_fast
+
+    @property
+    def vocab_size(self) -> int:
+        return self.tokenizer.vocab_size
+
+    @property
+    def max_token_id(self) -> int:
+        return self.tokenizer.max_token_id
+
+    @property
+    def truncation_side(self) -> str:
+        return self.tokenizer.truncation_side
+
+    def __hash__(self) -> int:
+        # Identity based hash: two wrappers are never considered the same key.
+        return hash(id(self))
+
+    def __len__(self) -> int:
+        # </think> is an added token in DeepseekV32 tokenizer
+        return self.vocab_size + len(self.get_added_vocab())
+
+    def __call__(
+        self,
+        text: str | list[str],
+        text_pair: str | None = None,
+        add_special_tokens: bool = True,
+        truncation: bool = False,
+        max_length: int | None = None,
+    ) -> "BatchEncoding":
+        """Tokenize `text` by calling the wrapped tokenizer."""
+        return self.tokenizer(
+            text,
+            text_pair=text_pair,
+            add_special_tokens=add_special_tokens,
+            truncation=truncation,
+            max_length=max_length,
+        )
+
+    def get_vocab(self) -> dict[str, int]:
+        return self.tokenizer.get_vocab()
+
+    def get_added_vocab(self) -> dict[str, int]:
+        return self.tokenizer.get_added_vocab()
+
+    def encode(
+        self,
+        text: str,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool = True,
+    ) -> list[int]:
+        """Encode `text` to token ids with the wrapped tokenizer."""
+        return self.tokenizer.encode(
+            text,
+            truncation=truncation,
+            max_length=max_length,
+            add_special_tokens=add_special_tokens,
+        )
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        return self.tokenizer.convert_tokens_to_string(tokens)
+
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
+        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+    def convert_ids_to_tokens(
+        self,
+        ids: list[int],
+        skip_special_tokens: bool = False,
+    ) -> list[str]:
+        return self.tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens
+        )
--- a/vllm/tokenizers/mistral.py
+++ b/vllm/tokenizers/mistral.py
@@ -14,13 +14,19 @@ if TYPE_CHECKING:
    )
    from mistral_common.tokens.tokenizers.tekken import Tekkenizer
    from transformers import BatchEncoding
-    from transformers.tokenization_mistral_common import (
-        MistralCommonTokenizer as TransformersMistralTokenizer,
-    )

    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
    from vllm.entrypoints.openai.protocol import ChatCompletionRequest

+    try:
+        # Transformers v5
+        from transformers.tokenization_mistral_common import MistralCommonBackend
+    except ImportError:
+        # Transformers v4
+        from transformers.tokenization_mistral_common import (
+            MistralCommonTokenizer as MistralCommonBackend,
+        )
+
 logger = init_logger(__name__)


@@ -208,11 +214,17 @@ class MistralTokenizer(TokenizerLike):
        **kwargs,
    ) -> "MistralTokenizer":
        from mistral_common.protocol.instruct.validator import ValidationMode
-        from transformers.tokenization_mistral_common import (
-            MistralCommonTokenizer as TransformersMistralTokenizer,
-        )

-        tokenizer = TransformersMistralTokenizer.from_pretrained(
+        try:
+            # Transformers v5
+            from transformers.tokenization_mistral_common import MistralCommonBackend
+        except ImportError:
+            # Transformers v4
+            from transformers.tokenization_mistral_common import (
+                MistralCommonTokenizer as MistralCommonBackend,
+            )
+
+        tokenizer = MistralCommonBackend.from_pretrained(
            path_or_repo_id,
            *args,
            mode=ValidationMode.test,
@@ -223,7 +235,7 @@ class MistralTokenizer(TokenizerLike):

        return cls(tokenizer)

-    def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
+    def __init__(self, tokenizer: "MistralCommonBackend") -> None:
        super().__init__()

        from mistral_common.protocol.instruct.validator import ValidationMode
@@ -297,6 +309,9 @@ class MistralTokenizer(TokenizerLike):
            for i in all_special_ids
        ]

+    def num_special_tokens_to_add(self) -> int:
+        """Return how many tokens `self.encode` emits for an empty string.
+
+        NOTE(review): with no text to tokenize, whatever `encode` returns is
+        presumably just the special tokens it adds by default - confirm
+        against `encode`'s default arguments.
+        """
+        return len(self.encode(""))
+
    # the following attributes are set to fit vLLM's design and are used
    # by the structured output backends.
    @property
@@ -409,6 +424,7 @@ class MistralTokenizer(TokenizerLike):
    ) -> list[int]:
        add_generation_prompt = kwargs.pop("add_generation_prompt", False)
        continue_final_message = kwargs.get("continue_final_message", False)
+        tokenize = kwargs.get("tokenize", True)
        padding = kwargs.get("padding", False)
        truncation = kwargs.get("truncation", False)
        max_length = kwargs.get("max_length")
@@ -421,7 +437,7 @@ class MistralTokenizer(TokenizerLike):
            conversation=messages,
            tools=tools,
            continue_final_message=continue_final_message,
-            tokenize=True,
+            tokenize=tokenize,
            padding=padding,
            truncation=truncation,
            max_length=max_length,

--- a/vllm/tokenizers/protocol.py
+++ b/vllm/tokenizers/protocol.py
@@ -22,6 +22,9 @@ class TokenizerLike(Protocol):
    ) -> "TokenizerLike":
        raise NotImplementedError

+    def num_special_tokens_to_add(self) -> int:
+        """Protocol stub; this default body only raises `NotImplementedError`.
+
+        NOTE(review): judging by the name, implementations are expected to
+        return the number of special tokens added when encoding - confirm
+        against the concrete tokenizers.
+        """
+        raise NotImplementedError
+
    @property
    def all_special_tokens(self) -> list[str]:
        raise NotImplementedError

--- a/vllm/tokenizers/registry.py
+++ b/vllm/tokenizers/registry.py
@@ -11,14 +11,14 @@ from typing_extensions import assert_never

 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
-from vllm.transformers_utils.repo_utils import list_filtered_repo_files
-from vllm.transformers_utils.utils import (
+from vllm.transformers_utils.gguf_utils import (
    check_gguf_file,
+    get_gguf_file_path_from_hf,
    is_gguf,
    is_remote_gguf,
    split_remote_gguf,
 )
+from vllm.transformers_utils.repo_utils import list_filtered_repo_files
 from vllm.utils.import_utils import resolve_obj_by_qualname

 from .protocol import TokenizerLike
@@ -183,7 +183,7 @@ def get_tokenizer(
            "`tokenizer_mode='custom'` when initializing vLLM.",
            tokenizer_args,
            str(tokenizer_kwargs),
-            tokenizer_mode,
+            tokenizer_name,
        )

        tokenizer_mode = str(tokenizer_name)

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -26,8 +26,15 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME

 from vllm import envs
 from vllm.logger import init_logger
+from vllm.transformers_utils.utils import parse_safetensors_file_metadata

 from .config_parser_base import ConfigParserBase
+from .gguf_utils import (
+    check_gguf_file,
+    is_gguf,
+    is_remote_gguf,
+    split_remote_gguf,
+)
 from .repo_utils import (
    _get_hf_token,
    file_or_path_exists,
@@ -36,13 +43,6 @@ from .repo_utils import (
    try_get_local_file,
    with_retry,
 )
-from .utils import (
-    check_gguf_file,
-    is_gguf,
-    is_remote_gguf,
-    parse_safetensors_file_metadata,
-    split_remote_gguf,
-)

 if envs.VLLM_USE_MODELSCOPE:
    from modelscope import AutoConfig
@@ -89,6 +89,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    step3_text="Step3TextConfig",
    qwen3_next="Qwen3NextConfig",
    lfm2_moe="Lfm2MoeConfig",
+    tarsier2="Tarsier2Config",
 )

 _CONFIG_ATTRS_MAPPING: dict[str, str] = {
@@ -127,6 +128,9 @@ class HFConfigParser(ConfigParserBase):
                if config_dict.get("speculators_config") is not None
                else model_type
            )
+        # Allow hf_overrides to override model_type before checking _CONFIG_REGISTRY
+        if (hf_overrides := kwargs.pop("hf_overrides", None)) is not None:
+            model_type = hf_overrides.get("model_type", model_type)

        if model_type in _CONFIG_REGISTRY:
            config_class = _CONFIG_REGISTRY[model_type]
@@ -300,17 +304,31 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No

 def patch_rope_parameters(config: PretrainedConfig) -> None:
    """Provide backwards compatibility for RoPE."""
+    from vllm.config.utils import getattr_iter
+
+    rope_theta_names = ("rope_theta", "rotary_emb_base")
+    rope_theta = getattr_iter(config, rope_theta_names, None)
    if Version(version("transformers")) < Version("5.0.0.dev0"):
        # Transformers v4 installed, legacy config fields may be present
        if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
            config.rope_parameters = rope_scaling
-        if (rope_theta := getattr(config, "rope_theta", None)) is not None:
+        if rope_theta is not None:
            if not hasattr(config, "rope_parameters"):
                config.rope_parameters = {"rope_type": "default"}
            config.rope_parameters["rope_theta"] = rope_theta
+        partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct")
+        partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None)
+        if partial_rotary_factor is not None:
+            if not hasattr(config, "rope_parameters"):
+                config.rope_parameters = {"rope_type": "default"}
+            config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+    elif rope_theta is not None or hasattr(config, "rope_parameters"):
+        # Transformers v5 installed
+        config.standardize_rope_params()
+        config.validate_rope()

    # No RoPE parameters to patch
-    if not hasattr(config, "rope_parameters"):
+    if getattr(config, "rope_parameters", None) is None:
        return

    # Add original_max_position_embeddings if present
@@ -351,7 +369,10 @@ def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
        rope_parameters["rope_type"] = "longrope"
        logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
    elif rope_parameters["rope_type"] == "mrope":
-        assert "mrope_section" in rope_parameters
+        if "mrope_section" not in rope_parameters:
+            raise ValueError(
+                "Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters"
+            )
        rope_parameters["rope_type"] = "default"
        logger.warning("Replacing legacy rope_type 'mrope' with 'default'")

@@ -584,6 +605,7 @@ def get_config(
        trust_remote_code=trust_remote_code,
        revision=revision,
        code_revision=code_revision,
+        hf_overrides=hf_overrides_kw,
        **kwargs,
    )
    # Special architecture mapping check for GGUF models
@@ -915,11 +937,13 @@ def get_hf_text_config(config: PretrainedConfig):
    """
    text_config = config.get_text_config()

-    if text_config is not config:
-        # The code operates under the assumption that text_config should have
-        # `num_attention_heads` (among others). Assert here to fail early
-        # if transformers config doesn't align with this assumption.
-        assert hasattr(text_config, "num_attention_heads")
+    if text_config is not config and not hasattr(text_config, "num_attention_heads"):
+        raise ValueError(
+            "The text_config extracted from the model config does not have "
+            "`num_attention_heads` attribute. This indicates a mismatch "
+            "between the model config and vLLM's expectations. Please "
+            "ensure that the model config is compatible with vLLM."
+        )

    return text_config

@@ -930,6 +954,13 @@ def try_get_generation_config(
    revision: str | None = None,
    config_format: str | ConfigFormat = "auto",
 ) -> GenerationConfig | None:
+    # GGUF files don't have generation_config.json - their config is embedded
+    # in the file header. Skip all filesystem lookups to avoid re-reading the
+    # memory-mapped file, which can hang in multi-process scenarios when the
+    # EngineCore process already has the file mapped.
+    if is_gguf(model):
+        return None
+
    try:
        return GenerationConfig.from_pretrained(
            model,

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -10,46 +10,47 @@ Model configs may be defined in this directory for the following reasons:
  deepseek-ai/DeepSeek-V3.2-Exp.
 """

-from transformers import DeepseekV3Config
+from __future__ import annotations

-from vllm.transformers_utils.configs.afmoe import AfmoeConfig
-from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
-from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
-from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
-from vllm.transformers_utils.configs.eagle import EAGLEConfig
+import importlib

-# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
-# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
-# `FalconConfig` class from the official HuggingFace transformers library.
-from vllm.transformers_utils.configs.falcon import RWConfig
-from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
-from vllm.transformers_utils.configs.hunyuan_vl import (
-    HunYuanVLConfig,
-    HunYuanVLTextConfig,
-    HunYuanVLVisionConfig,
-)
-from vllm.transformers_utils.configs.jais import JAISConfig
-from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
-from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
-from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
-from vllm.transformers_utils.configs.medusa import MedusaConfig
-from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
-from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
-from vllm.transformers_utils.configs.moonvit import MoonViTConfig
-from vllm.transformers_utils.configs.nemotron import NemotronConfig
-from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
-from vllm.transformers_utils.configs.olmo3 import Olmo3Config
-from vllm.transformers_utils.configs.ovis import OvisConfig
-from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
-from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
-from vllm.transformers_utils.configs.step3_vl import (
-    Step3TextConfig,
-    Step3VisionEncoderConfig,
-    Step3VLConfig,
-)
-from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+# Maps each lazily exported class name to the module that defines it. The
+# module-level `__getattr__` in this file imports the mapped module when the
+# name is looked up, instead of importing every config module eagerly.
+_CLASS_TO_MODULE: dict[str, str] = {
+    "AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
+    "ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
+    "DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
+    "DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
+    "EAGLEConfig": "vllm.transformers_utils.configs.eagle",
+    "FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
+    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+    # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+    # `FalconConfig` class from the official HuggingFace transformers library.
+    "RWConfig": "vllm.transformers_utils.configs.falcon",
+    "JAISConfig": "vllm.transformers_utils.configs.jais",
+    "Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
+    "MedusaConfig": "vllm.transformers_utils.configs.medusa",
+    "MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
+    "MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
+    "MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
+    "KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
+    "KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
+    "NemotronConfig": "vllm.transformers_utils.configs.nemotron",
+    "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
+    "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
+    "OvisConfig": "vllm.transformers_utils.configs.ovis",
+    "RadioConfig": "vllm.transformers_utils.configs.radio",
+    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
+    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
+    "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
+    "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
+    "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
+    # Special case: DeepseekV3Config is from HuggingFace Transformers
+    "DeepseekV3Config": "transformers",
+}

 __all__ = [
    "AfmoeConfig",
@@ -82,4 +83,18 @@ __all__ = [
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
    "Qwen3NextConfig",
+    "Tarsier2Config",
 ]
+
+
+def __getattr__(name: str):
+    """Lazily resolve an exported config class on attribute access (PEP 562).
+
+    Looks `name` up in `_CLASS_TO_MODULE`, imports the module it maps to and
+    returns the attribute of that name from the imported module. The result
+    is stored in this module's globals so later lookups of the same name are
+    served by normal attribute access and do not re-enter this hook.
+
+    Raises:
+        AttributeError: if `name` is not a key of `_CLASS_TO_MODULE`.
+    """
+    module_name = _CLASS_TO_MODULE.get(name)
+    if module_name is None:
+        # Report the real module name rather than a hard-coded short one, in
+        # the same shape as the interpreter's own message.
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    value = getattr(importlib.import_module(module_name), name)
+    # Cache on the module: __getattr__ only runs when normal lookup fails.
+    globals()[name] = value
+    return value
+
+
+def __dir__():
+    """Return the names listed in `__all__`, sorted, as this module's `dir()`."""
+    # sorted() already builds a new list, so no intermediate copy is needed.
+    return sorted(__all__)
--- a/vllm/transformers_utils/configs/nemotron.py
+++ b/vllm/transformers_utils/configs/nemotron.py
@@ -89,9 +89,14 @@ class NemotronConfig(PretrainedConfig):
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`dict`, *optional*):
-            The parameters of the RoPE embeddings.
-        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
-            Percentage of the query and keys which will have rotary embedding.
+            The parameters of the RoPE embeddings. Expected contents:
+                `rope_theta` (`float`): The base period of the RoPE embeddings.
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear',
+                    'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
+                    original RoPE implementation.
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
+                    Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output
            projection layers during self-attention.
@@ -133,7 +138,6 @@ class NemotronConfig(PretrainedConfig):
        eos_token_id=3,
        tie_word_embeddings=False,
        rope_parameters=None,
-        partial_rotary_factor=0.5,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
@@ -165,14 +169,16 @@ class NemotronConfig(PretrainedConfig):
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
-        self.rope_parameters = rope_parameters
        # for backward compatibility
        partial_rotary_factor = (
            kwargs.get("rope_percent")
            or kwargs.get("rope_percentage")
-            or partial_rotary_factor
+            or kwargs.get("partial_rotary_factor")
+            or 0.5
        )
-        self.partial_rotary_factor = partial_rotary_factor
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+        self.rope_parameters = rope_parameters
        self._rope_parameters_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

--- a/vllm/transformers_utils/configs/nemotron_h.py
+++ b/vllm/transformers_utils/configs/nemotron_h.py
@@ -189,6 +189,7 @@ class NemotronHConfig(PretrainedConfig):
        n_shared_experts=1,
        moe_intermediate_size=7688,
        moe_shared_expert_intermediate_size=7688,
+        moe_latent_size=None,
        num_experts_per_tok=2,
        routed_scaling_factor=1.0,
        n_group=1,
@@ -254,6 +255,7 @@ class NemotronHConfig(PretrainedConfig):
        self.n_shared_experts = n_shared_experts
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size  # noqa: E501
+        self.moe_latent_size = moe_latent_size
        self.num_experts_per_tok = num_experts_per_tok
        self.routed_scaling_factor = routed_scaling_factor
        self.n_group = n_group

--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -103,8 +103,8 @@ class Qwen3NextConfig(PretrainedConfig):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
-            Percentage of the query and keys which will have rotary embedding.
+                `partial_rotary_factor` (`float`, *optional*, defaults to 0.25):
+                    Percentage of the query and keys which will have rotary embedding.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -198,7 +198,6 @@ class Qwen3NextConfig(PretrainedConfig):
        use_cache=True,
        tie_word_embeddings=False,
        rope_parameters=None,
-        partial_rotary_factor=0.25,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=256,
@@ -239,6 +238,9 @@ class Qwen3NextConfig(PretrainedConfig):
        rope_theta = kwargs.pop("rope_theta", 10000.0)
        if "rope_theta" not in rope_parameters:
            rope_parameters["rope_theta"] = rope_theta
+        partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
+        if "partial_rotary_factor" not in rope_parameters:
+            rope_parameters["partial_rotary_factor"] = partial_rotary_factor
        self.rope_parameters = rope_parameters
        self.partial_rotary_factor = partial_rotary_factor
        self.attention_bias = attention_bias

--- a/vllm/transformers_utils/configs/tarsier2.py
+++ b/vllm/transformers_utils/configs/tarsier2.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import Qwen2VLConfig
+
+
+class Tarsier2Config(Qwen2VLConfig):
+    """
+    Flat config class for Tarsier2 checkpoints.
+
+    Tarsier2's config.json is written such that AutoConfig.from_pretrained will
+    create a deeply nested config consisting of:
+
+    - LlavaConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+      - Qwen2VLConfig
+        - Qwen2VLTextConfig
+        - Qwen2VLVisionConfig
+
+    whereas it should really just be a single Qwen2VLConfig.
+
+    This class is a hack to stop AutoConfig from creating the nested config
+    structure: it adds nothing to Qwen2VLConfig except its own `model_type`.
+    """
+
+    # Key under which this class is listed in vLLM's _CONFIG_REGISTRY
+    # (`tarsier2="Tarsier2Config"`).
+    model_type = "tarsier2"
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -61,6 +61,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        projector_ln_mid: bool = False,
+        num_projector_layers: int = 0,
        **kwargs,
    ):
        self.ignore_index = ignore_index
@@ -71,6 +72,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
        self.norm_init = norm_init
        self.projector_act = projector_act
        self.projector_ln_mid = projector_ln_mid
+        self.num_projector_layers = num_projector_layers

        # N.B. May set the wrapped_model_config below.
        self.text_model_id = text_model_id