Commit 8d75f22e authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
......@@ -44,6 +44,10 @@ _REASONING_PARSERS_TO_REGISTER = {
"granite_reasoning_parser",
"GraniteReasoningParser",
),
"holo2": (
"holo2_reasoning_parser",
"Holo2ReasoningParser",
),
"hunyuan_a13b": (
"hunyuan_a13b_reasoning_parser",
"HunyuanA13BReasoningParser",
......
......@@ -63,6 +63,31 @@ class ReasoningParser:
True if the reasoning content ends in the input_ids.
"""
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Decode-step variant of `is_reasoning_end`.

    Structured-output engines such as `xgrammar` call this during a decode
    step to find out whether the reasoning content has ended in the model
    output.

    Parameters:
    input_ids: list[int]
        The entire model output.
    delta_ids: list[int]
        The last few computed tokens of the model output at the current
        decode step.

    Returns:
    bool
        True if the reasoning content ends in the `delta_ids` on a
        decode step.
    """
    # Default behaviour: `delta_ids` is not inspected; the decision is
    # delegated to the full-sequence check.
    reasoning_finished = self.is_reasoning_end(input_ids)
    return reasoning_finished
@abstractmethod
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
"""
......@@ -121,7 +146,7 @@ class ReasoningParser:
self,
original_tag: str | None,
tool_server: ToolServer | None,
) -> str:
) -> str | None:
"""
Instance method that is implemented for preparing the structured tag
Otherwise, None is returned
......@@ -160,7 +185,10 @@ class ReasoningParserManager:
if name in cls.lazy_parsers:
return cls._load_lazy_parser(name)
raise KeyError(f"Reasoning parser '{name}' not found.")
registered = ", ".join(cls.list_registered())
raise KeyError(
f"Reasoning parser '{name}' not found. Available parsers: {registered}"
)
@classmethod
def list_registered(cls) -> list[str]:
......
......@@ -64,8 +64,21 @@ class BaseThinkingReasoningParser(ReasoningParser):
)
def is_reasoning_end(self, input_ids: list[int]) -> bool:
    """
    Report whether the most recent reasoning section in `input_ids` is closed.

    The ids are scanned from the end towards the beginning. Whichever marker
    is met first decides the answer: meeting the start token means a
    reasoning section is still open, meeting the end token means it has been
    closed.

    Parameters:
    input_ids: list[int]
        Token ids to inspect.

    Returns:
    bool
        True if the end token is found before any start token when scanning
        backwards; False otherwise, including when neither token is present.
    """
    start_token_id = self.start_token_id
    end_token_id = self.end_token_id
    # A stale early `return any(...)` used to sit here; it made this scan
    # unreachable and ignored start tokens that re-open reasoning.
    for token_id in reversed(input_ids):
        if token_id == start_token_id:
            return False
        if token_id == end_token_id:
            return True
    return False
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """
    Return True when the end token is among the newly produced `delta_ids`.

    `input_ids` is accepted for interface compatibility and is not inspected.
    """
    return self.end_token_id in delta_ids
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
"""
......
......@@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser):
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    """Forward the end-of-reasoning check to the wrapped `self._parser`."""
    return self._parser.is_reasoning_end(input_ids)
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """Forward the decode-step end-of-reasoning check to `self._parser`."""
    return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    """Forward content-id extraction to the wrapped `self._parser`."""
    return self._parser.extract_content_ids(input_ids)
......
......@@ -5,7 +5,7 @@ from collections.abc import Sequence
from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.harmony_utils import parse_chat_output
from vllm.entrypoints.openai.parser.harmony_utils import parse_chat_output
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.entrypoints.tool_server import ToolServer
from vllm.logger import init_logger
......@@ -145,7 +145,7 @@ class GptOssReasoningParser(ReasoningParser):
# This function prepares the structural tag to format reasoning output
def prepare_structured_tag(
self, original_tag: str | None, tool_server: ToolServer | None
) -> str:
) -> str | None:
if original_tag is None:
if tool_server is None:
return json.dumps(no_func_reaonsing_tag)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import (
ReasoningParser,
)
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from vllm.reasoning.identity_reasoning_parser import IdentityReasoningParser
from vllm.tokenizers import TokenizerLike
logger = init_logger(__name__)
class Holo2ReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Holo2 models, which are based on Qwen3.

    Holo2 marks reasoning text with <think>...</think>, but the opening
    <think> is part of the chat template. The parser therefore treats
    everything in the model output up to </think> as reasoning content.

    Reasoning can be switched on or off with the `thinking` chat template
    argument (default: True).

    Parsing rules on model output:
        - thinking == False
            -> Model output is treated as purely the content |content|
        - thinking == True
            -> Model output is |reasoning_content|</think>|content|
    """

    def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)

        template_args = kwargs.get("chat_template_kwargs", {}) or {}
        # Deepseek V3 and Holo2 are similar, but Holo2 models think by default.
        # The structured output manager creates this parser once, without any
        # user-specified chat template args, and shares it across requests, so
        # the default for `thinking` must be True when the argument is absent.
        thinking_enabled = bool(template_args.get("thinking", True))

        parser_cls = (
            DeepSeekR1ReasoningParser if thinking_enabled else IdentityReasoningParser
        )
        self._parser = parser_cls(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Delegate the end-of-reasoning check to the selected parser."""
        delegate = self._parser
        return delegate.is_reasoning_end(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """Delegate the decode-step end-of-reasoning check."""
        delegate = self._parser
        return delegate.is_reasoning_end_streaming(input_ids, delta_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Delegate content-id extraction to the selected parser."""
        delegate = self._parser
        return delegate.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[str | None, str | None]:
        """Delegate full-output reasoning extraction to the selected parser."""
        delegate = self._parser
        return delegate.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Delegate streaming reasoning extraction to the selected parser."""
        delegate = self._parser
        return delegate.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
......@@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser):
# Always return True, since we never treat reasoning specially
return True
def is_reasoning_end_streaming(
    self, input_ids: list[int], delta_ids: list[int]
) -> bool:
    """Always report reasoning as ended; neither argument is inspected."""
    return True
def extract_content_ids(self, input_ids: list[int]) -> list[int]:
    """Return `input_ids` unchanged (the same list object, not a copy)."""
    # Identity: return all tokens as content
    return input_ids
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .deepseekv32 import DeepseekV32Tokenizer
from .hf import HfTokenizer
from .mistral import MistralTokenizer
from .protocol import TokenizerLike
......@@ -21,4 +22,5 @@ __all__ = [
"get_tokenizer",
"cached_tokenizer_from_config",
"init_tokenizer_from_config",
"DeepseekV32Tokenizer",
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
import copy
import json
from typing import Any
import regex as re
# flake8: noqa: E501
# System-prompt section describing the tool-calling syntax. Formatted by
# `render_tools` with `dsml_token`, `thinking_start_token`,
# `thinking_end_token` and `tool_schemas`.
TOOLS_SYSTEM_TEMPLATE = """## Tools
You have access to a set of tools you can use to answer the user's question.
You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
<{dsml_token}function_calls>
<{dsml_token}invoke name="$FUNCTION_NAME">
<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
...
</{dsml_token}invoke>
<{dsml_token}invoke name="$FUNCTION_NAME2">
...
</{dsml_token}invoke>
</{dsml_token}function_calls>
String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
<{dsml_token}function_calls>
...
</{dsml_token}function_calls>
<function_results>
...
</function_results>
{thinking_start_token}...thinking about results{thinking_end_token}
Here are the functions available in JSONSchema format:
<functions>
{tool_schemas}
</functions>
"""
# Special tokens used when rendering prompts and parsing completions.
bos_token: str = "<|begin▁of▁sentence|>"
eos_token: str = "<|end▁of▁sentence|>"
thinking_start_token: str = "<think>"
thinking_end_token: str = "</think>"
# Marker embedded in every tool-call tag (function_calls / invoke / parameter).
dsml_token: str = "|DSML|"
# Per-role message templates consumed by `render_message`.
system_msg_template: str = "{content}"
user_msg_template: str = "<|User|>{content}<|Assistant|>"
assistant_msg_template: str = "{reasoning}{content}{tool_calls}<|end▁of▁sentence|>"
thinking_template: str = "{reasoning_content}"
response_format_template: str = "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
# One `invoke` element; `arguments` is the output of `encode_arguments_to_dsml`.
tool_call_template: str = (
    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
)
# Wrapper around all `invoke` elements of a single assistant message.
tool_calls_template: str = (
    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
)
# One tool result inside a <function_results> section.
tool_output_template: str = "\n<result>{content}</result>"
def to_json(value: Any) -> str:
    """Serialize `value` to JSON, keeping non-ASCII characters when possible.

    The first attempt uses `ensure_ascii=False`; if that raises any
    `Exception`, serialization is retried with `ensure_ascii=True`.
    """
    try:
        encoded = json.dumps(value, ensure_ascii=False)
    except Exception:
        encoded = json.dumps(value, ensure_ascii=True)
    return encoded
def tools_from_openai_format(tools):
    """Return the `"function"` entry of every tool, in order."""
    functions = []
    for entry in tools:
        functions.append(entry["function"])
    return functions
def tool_calls_from_openai_format(tool_calls):
    """Flatten OpenAI-style tool calls into `{"name", "arguments"}` dicts."""
    flattened = []
    for call in tool_calls:
        function = call["function"]
        flattened.append(
            {"name": function["name"], "arguments": function["arguments"]}
        )
    return flattened
def tool_calls_to_openai_format(tool_calls):
    """Wrap `{"name", "arguments"}` dicts into OpenAI-style function calls."""
    wrapped = []
    for call in tool_calls:
        function = {"name": call["name"], "arguments": call["arguments"]}
        wrapped.append({"type": "function", "function": function})
    return wrapped
def encode_arguments_to_dsml(tool_call: dict[str, str]) -> str:
    """Render a tool call's arguments as newline-joined DSML parameter tags.

    `tool_call["arguments"]` may be a JSON string (decoded first) or an
    already-decoded mapping. String values are emitted verbatim with
    `string="true"`; every other value is JSON-encoded with `string="false"`.
    """
    raw_arguments = tool_call["arguments"]
    if isinstance(raw_arguments, str):
        arguments = json.loads(raw_arguments)
    else:
        arguments = raw_arguments

    param_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
    rendered = []
    for name, value in arguments.items():
        if isinstance(value, str):
            is_str, text = "true", value
        else:
            is_str, text = "false", to_json(value)
        rendered.append(
            param_template.format(
                dsml_token=dsml_token, key=name, is_str=is_str, value=text
            )
        )
    return "\n".join(rendered)
def decode_dsml_to_arguments(
    tool_name: str, tool_args: dict[str, tuple[str, str]]
) -> dict[str, str]:
    """Build a `{"name", "arguments"}` dict from parsed DSML parameters.

    `tool_args` maps each parameter name to `(raw_value, string_flag)`.
    Values flagged `"true"` are JSON-encoded as strings; all others are
    inserted verbatim. `arguments` is the resulting JSON object text.
    """
    members = []
    for key, (value, is_str) in tool_args.items():
        if is_str == "true":
            value = to_json(value)
        members.append(f"{to_json(key)}: {value}")
    arguments_json = "{" + ", ".join(members) + "}"
    return {"name": tool_name, "arguments": arguments_json}
def render_tools(tools: list[dict[str, str | dict[str, Any]]]) -> str:
    """Fill `TOOLS_SYSTEM_TEMPLATE` with one JSON-encoded schema per line."""
    schema_lines = "\n".join(to_json(tool) for tool in tools)
    return TOOLS_SYSTEM_TEMPLATE.format(
        tool_schemas=schema_lines,
        dsml_token=dsml_token,
        thinking_start_token=thinking_start_token,
        thinking_end_token=thinking_end_token,
    )
def find_last_user_index(messages: list[dict[str, Any]]) -> int:
    """Return the index of the last `user` or `developer` message, or -1."""
    position = len(messages)
    while position > 0:
        position -= 1
        if messages[position].get("role") in ["user", "developer"]:
            return position
    return -1
def render_message(
    index: int, messages: list[dict[str, Any]], thinking_mode: str
) -> str:
    """Render `messages[index]` into its prompt-text fragment.

    Parameters:
        index: Position of the message to render; must be a valid index.
        messages: The whole conversation. Neighbouring messages are consulted
            to locate the last user turn and, for tool results, the message
            that issued the calls.
        thinking_mode: Either "chat" or "thinking".

    Returns:
        The text contributed by this single message.

    Raises:
        AssertionError: If the index, thinking mode or message layout is
            invalid.
        NotImplementedError: If the message role is not one of system,
            developer, user, tool or assistant.
    """
    assert 0 <= index < len(messages)
    assert thinking_mode in ["chat", "thinking"], (
        f"Invalid thinking_mode `{thinking_mode}`"
    )
    prompt = ""
    msg = messages[index]
    last_user_idx = find_last_user_index(messages)
    role = msg.get("role")
    content = msg.get("content")
    tools = msg.get("tools")
    response_format = msg.get("response_format")
    tool_calls = msg.get("tool_calls")
    # "reasoning" takes precedence; "reasoning_content" is the fallback key.
    reasoning_content = msg.get("reasoning") or msg.get("reasoning_content")
    # Normalise OpenAI-style structures into the flat form used below.
    if tools:
        tools = tools_from_openai_format(tools)
    if tool_calls:
        tool_calls = tool_calls_from_openai_format(tool_calls)
    if role == "system":
        prompt += system_msg_template.format(content=content or "")
        if tools:
            prompt += "\n\n" + render_tools(tools)
        if response_format:
            prompt += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
    elif role == "developer":
        assert content, f"Invalid message for role `{role}`: {msg}"
        # Tools and response format come first, then the actual message text;
        # the result is wrapped in the user template.
        content_developer = ""
        if tools:
            content_developer += "\n\n" + render_tools(tools)
        if response_format:
            content_developer += "\n\n" + response_format_template.format(
                schema=to_json(response_format)
            )
        content_developer += "\n\n# The user's message is: {}".format(content)
        prompt += user_msg_template.format(content=content_developer)
        # Only the last user/developer turn opens a thinking section; any
        # other turn gets the thinking-end token instead.
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "user":
        prompt += user_msg_template.format(content=content)
        if index == last_user_idx and thinking_mode == "thinking":
            prompt += thinking_start_token
        else:
            prompt += thinking_end_token
    elif role == "tool":
        # Walk back over preceding tool messages to find the message that
        # issued the calls.
        prev_assistant_idx = index - 1
        assistant_msg = messages[prev_assistant_idx]
        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
            prev_assistant_idx -= 1
            assistant_msg = messages[prev_assistant_idx]
        assert (
            index == 0
            or prev_assistant_idx >= 0
            and assistant_msg.get("role") == "assistant"
        ), f"Invalid messages at {index}:\n{assistant_msg}"
        # 1-based position of this result among the results that follow the
        # assistant message.
        tool_call_order = index - prev_assistant_idx
        assistant_tool_calls = assistant_msg.get("tool_calls")
        assert assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order, (
            "No tool calls but found tool output"
        )
        # The first result opens the <function_results> section.
        if tool_call_order == 1:
            prompt += "\n\n<function_results>"
        prompt += tool_output_template.format(content=content)
        # The last result closes the section and appends a thinking token.
        if tool_call_order == len(assistant_tool_calls):
            prompt += "\n</function_results>"
            if index >= last_user_idx and thinking_mode == "thinking":
                prompt += "\n\n" + thinking_start_token
            else:
                prompt += "\n\n" + thinking_end_token
    elif role == "assistant":
        # NOTE(review): assigned but not read again in this branch.
        prev_assistant_idx = index
        thinking_part = ""
        tool_calls_content = ""
        if tool_calls:
            # Render each call as an invoke element, then wrap them all in a
            # single function_calls element.
            tool_calls = [
                tool_call_template.format(
                    dsml_token=dsml_token,
                    name=tool_call.get("name"),
                    arguments=encode_arguments_to_dsml(tool_call),
                )
                for tool_call in tool_calls
            ]
            tool_calls_content += "\n\n" + tool_calls_template.format(
                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
            )
        summary_content = content or ""
        # Reasoning is rendered only for assistant messages after the last
        # user turn, and only in thinking mode.
        if thinking_mode == "thinking" and index > last_user_idx:
            assert reasoning_content or tool_calls, (
                f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
            )
            thinking_part = (
                thinking_template.format(reasoning_content=reasoning_content or "")
                + thinking_end_token
            )
        prompt += assistant_msg_template.format(
            reasoning=thinking_part,
            content=summary_content,
            tool_calls=tool_calls_content,
        )
    else:
        raise NotImplementedError(f"Unknown role: {role}")
    return prompt
def drop_thinking_messages(
messages: list[dict[str, Any]], last_user_idx: int | None = None
) -> list[dict[str, Any]]:
messages_wo_thinking: list[dict[str, Any]] = []
last_user_idx = (
find_last_user_index(messages) if last_user_idx is None else last_user_idx
)
for idx, msg in enumerate(messages):
role = msg.get("role")
if role in ["user", "system", "tool"] or idx >= last_user_idx:
messages_wo_thinking.append(msg)
continue
elif role == "assistant":
msg_wo_thinking = copy.copy(msg)
msg_wo_thinking.pop("reasoning_content", None)
msg_wo_thinking.pop("reasoning", None)
messages_wo_thinking.append(msg_wo_thinking)
return messages_wo_thinking
def encode_messages(
    messages: list[dict[str, Any]],
    thinking_mode: str,
    context: list[dict[str, Any]] | None = None,
    drop_thinking: bool = True,
    add_default_bos_token: bool = True,
) -> str:
    """Render `messages` into prompt text, using `context` as prior history.

    Only `messages` are rendered; `context` entries are prepended so that
    each message is rendered with the full conversation visible. The BOS
    token is emitted only when `add_default_bos_token` is set and there is
    no context. In "thinking" mode with `drop_thinking` enabled, the
    conversation is first passed through `drop_thinking_messages`.
    """
    context = context or []
    offset = len(context)
    conversation = context + messages

    pieces = [bos_token] if add_default_bos_token and offset == 0 else [""]
    if thinking_mode == "thinking" and drop_thinking:
        conversation = drop_thinking_messages(conversation)

    for position in range(offset, offset + len(messages)):
        pieces.append(
            render_message(position, conversation, thinking_mode=thinking_mode)
        )
    return "".join(pieces)
def _read_until_stop(
index: int, text: str, stop: list[str]
) -> tuple[int, str, None | str]:
min_pos = len(text)
matched_stop = None
for s in stop:
pos = text.find(s, index)
if pos != -1 and pos < min_pos:
min_pos = pos
matched_stop = s
if matched_stop:
content = text[index:min_pos]
return min_pos + len(matched_stop), content, matched_stop
else:
content = text[index:]
return len(text), content, None
def parse_tool_calls(index: int, text: str):
    """Parse the invoke elements of a function_calls section.

    Parsing starts at `index` in `text` and continues until the closing
    function_calls tag is consumed or the text is exhausted.

    Returns:
        A tuple `(index, stop_token, tool_calls)`: the position after the
        last consumed token, the last stop token matched, and the decoded
        calls as produced by `decode_dsml_to_arguments`.

    Raises:
        AssertionError: If the text does not follow the expected format.
    """
    tool_calls: list[dict[str, Any]] = []
    stop_token = None
    tool_calls_end_token = f"</{dsml_token}function_calls>"
    while index < len(text):
        # Advance to the next invoke element or to the end of the section;
        # only ">\n" (the tail of the previous tag) may sit in between.
        index, _, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
        )
        assert _ == ">\n", "Tool call format error"
        if stop_token == tool_calls_end_token:
            break
        assert stop_token is not None, "Missing special token"
        # The text up to the first parameter (or the invoke end tag) carries
        # the name attribute.
        index, tool_name_content, stop_token = _read_until_stop(
            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
        )
        p_tool_name = re.findall(
            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
        )
        assert len(p_tool_name) == 1, "Tool name format error"
        tool_name = p_tool_name[0]
        # Maps parameter name -> (raw value, "true"/"false" string flag).
        tool_args: dict[str, tuple[str, str]] = {}
        while stop_token == f"<{dsml_token}parameter":
            # Read up to the parameter's closing tag. The stop string omits
            # the leading "<", which therefore ends `param_content` and is
            # matched by the trailing "<" in the pattern below.
            index, param_content, stop_token = _read_until_stop(
                index, text, [f"/{dsml_token}parameter"]
            )
            param_kv = re.findall(
                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
                param_content,
                flags=re.DOTALL,
            )
            assert len(param_kv) == 1, "Parameter format error"
            param_name, string, param_value = param_kv[0]
            assert param_name not in tool_args, "Duplicate parameter name"
            tool_args[param_name] = (param_value, string)
            # Move on to the next parameter or to the end of this invoke.
            index, content, stop_token = _read_until_stop(
                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
            )
            assert content == ">\n", "Parameter format error"
        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
        tool_calls.append(tool_call)
    return index, stop_token, tool_calls
# NOTE: This function is designed to parse only correctly
# formatted string and will not attempt to correct malformed output
# that may be generated by the model.
def parse_message_from_completion_text(text: str, thinking_mode: str):
    """Split a completion into reasoning, content and tool calls.

    Parameters:
        text: Raw completion text.
        thinking_mode: When equal to "thinking", the text must begin with a
            reasoning section terminated by the thinking-end token.

    Returns:
        An assistant message dict with keys "role", "content",
        "reasoning_content", "reasoning" (same value as "reasoning_content")
        and "tool_calls" (OpenAI-style).

    Raises:
        AssertionError: If the text does not follow the expected format.
    """
    summary_content, reasoning_content, tool_calls = "", "", []
    index, stop_token = 0, None
    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
    is_thinking, is_tool_calling = thinking_mode == "thinking", False
    if is_thinking:
        # The reasoning must end with the thinking-end token; reaching a
        # tool-call section first is treated as a format error.
        index, content_delta, stop_token = _read_until_stop(
            index, text, [thinking_end_token, tool_calls_start_token]
        )
        reasoning_content = content_delta
        assert stop_token == thinking_end_token, "Invalid thinking format"
    # The visible answer runs up to EOS or the start of a tool-call section.
    index, content_delta, stop_token = _read_until_stop(
        index, text, [eos_token, tool_calls_start_token]
    )
    summary_content = content_delta
    if stop_token == tool_calls_start_token:
        is_tool_calling = True
    else:
        assert stop_token == eos_token, "Invalid summary format"
    if is_tool_calling:
        index, stop_token, tool_calls = parse_tool_calls(index, text)
        # Nothing except an optional EOS may follow the tool calls.
        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
        assert not tool_ends_text, "Unexpected content after tool calls"
    assert len(text) == index and stop_token in [eos_token, None], (
        "Unexpected content at end"
    )
    # Special tokens must not appear inside the extracted texts.
    for sp_token in [
        bos_token,
        eos_token,
        thinking_start_token,
        thinking_end_token,
        dsml_token,
    ]:
        assert sp_token not in summary_content and sp_token not in reasoning_content, (
            "Unexpected special token in content"
        )
    return {
        "role": "assistant",
        "content": summary_content,
        "reasoning_content": reasoning_content,
        "reasoning": reasoning_content,
        "tool_calls": tool_calls_to_openai_format(tool_calls),
    }
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
from transformers import BatchEncoding
from .deepseek_v32_encoding import encode_messages
from .hf import HfTokenizer, TokenizerLike
from .registry import TokenizerRegistry
@TokenizerRegistry.register("deepseek_v32")
class DeepseekV32Tokenizer(HfTokenizer):
    """Tokenizer wrapper that renders chat prompts with `encode_messages`.

    Everything except chat templating is forwarded to the wrapped tokenizer.
    """

    def __init__(self, tokenizer: TokenizerLike):
        self.tokenizer = tokenizer
        # Fall back to an empty string when the wrapped tokenizer has no name.
        self.name_or_path = getattr(tokenizer, "name_or_path", "")

    @classmethod
    def from_pretrained(
        cls,
        path_or_repo_id: str | Path,
        *args,
        trust_remote_code: bool = False,
        revision: str | None = None,
        download_dir: str | None = None,
        **kwargs,
    ) -> "TokenizerLike":
        """Load the tokenizer through the parent class and wrap it."""
        wrapped = super().from_pretrained(
            path_or_repo_id,
            *args,
            trust_remote_code=trust_remote_code,
            revision=revision,
            download_dir=download_dir,
            **kwargs,
        )
        return DeepseekV32Tokenizer(wrapped)

    def apply_chat_template(self, messages, tools=None, **kwargs):
        """Render a conversation to a prompt string.

        Recognised keyword arguments are `thinking` (default False) and
        `conversation` (used instead of `messages` when given). When tools
        are supplied, a system message carrying them is inserted at the
        front and earlier reasoning is kept.
        """
        thinking_mode = "thinking" if kwargs.get("thinking", False) else "chat"

        # Shallow copy so the caller's list is not modified by the insert.
        messages = kwargs.get("conversation", messages).copy()

        has_tools = tools is not None and len(tools) > 0
        if has_tools:
            messages.insert(0, {"role": "system", "tools": tools})

        prompt_str = encode_messages(
            messages, thinking_mode=thinking_mode, drop_thinking=not has_tools
        )  # type: ignore
        return prompt_str

    def num_special_tokens_to_add(self) -> int:
        """Return the number of ids produced by encoding an empty string."""
        return len(self.encode(""))

    @property
    def all_special_tokens(self) -> list[str]:
        return self.tokenizer.all_special_tokens

    @property
    def all_special_ids(self) -> list[int]:
        return self.tokenizer.all_special_ids

    @property
    def bos_token_id(self) -> int:
        return self.tokenizer.bos_token_id

    @property
    def eos_token_id(self) -> int:
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self) -> int:
        return self.tokenizer.pad_token_id

    @property
    def is_fast(self) -> bool:
        return self.tokenizer.is_fast

    @property
    def vocab_size(self) -> int:
        return self.tokenizer.vocab_size

    @property
    def max_token_id(self) -> int:
        return self.tokenizer.max_token_id

    @property
    def truncation_side(self) -> str:
        return self.tokenizer.truncation_side

    def __hash__(self) -> int:
        # Identity-based hash.
        return hash(id(self))

    def __len__(self) -> int:
        # </think> is an added token in DeepseekV32 tokenizer
        added_vocab = self.get_added_vocab()
        return self.vocab_size + len(added_vocab)

    def __call__(
        self,
        text: str | list[str],
        text_pair: str | None = None,
        add_special_tokens: bool = True,
        truncation: bool = False,
        max_length: int | None = None,
    ) -> "BatchEncoding":
        """Forward the call to the wrapped tokenizer."""
        return self.tokenizer(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            max_length=max_length,
        )

    def get_vocab(self) -> dict[str, int]:
        return self.tokenizer.get_vocab()

    def get_added_vocab(self) -> dict[str, int]:
        return self.tokenizer.get_added_vocab()

    def encode(
        self,
        text: str,
        truncation: bool | None = None,
        max_length: int | None = None,
        add_special_tokens: bool = True,
    ) -> list[int]:
        """Forward encoding to the wrapped tokenizer."""
        return self.tokenizer.encode(
            text,
            truncation=truncation,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
        )

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return self.tokenizer.convert_tokens_to_string(tokens)

    def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

    def convert_ids_to_tokens(
        self,
        ids: list[int],
        skip_special_tokens: bool = False,
    ) -> list[str]:
        return self.tokenizer.convert_ids_to_tokens(
            ids, skip_special_tokens=skip_special_tokens
        )
......@@ -14,13 +14,19 @@ if TYPE_CHECKING:
)
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from transformers import BatchEncoding
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as TransformersMistralTokenizer,
)
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
except ImportError:
# Transformers v4
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as MistralCommonBackend,
)
logger = init_logger(__name__)
......@@ -208,11 +214,17 @@ class MistralTokenizer(TokenizerLike):
**kwargs,
) -> "MistralTokenizer":
from mistral_common.protocol.instruct.validator import ValidationMode
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as TransformersMistralTokenizer,
)
tokenizer = TransformersMistralTokenizer.from_pretrained(
try:
# Transformers v5
from transformers.tokenization_mistral_common import MistralCommonBackend
except ImportError:
# Transformers v4
from transformers.tokenization_mistral_common import (
MistralCommonTokenizer as MistralCommonBackend,
)
tokenizer = MistralCommonBackend.from_pretrained(
path_or_repo_id,
*args,
mode=ValidationMode.test,
......@@ -223,7 +235,7 @@ class MistralTokenizer(TokenizerLike):
return cls(tokenizer)
def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None:
def __init__(self, tokenizer: "MistralCommonBackend") -> None:
super().__init__()
from mistral_common.protocol.instruct.validator import ValidationMode
......@@ -297,6 +309,9 @@ class MistralTokenizer(TokenizerLike):
for i in all_special_ids
]
def num_special_tokens_to_add(self) -> int:
    """Return the number of token ids produced by encoding an empty string."""
    return len(self.encode(""))
# the following attributes are set to fit vLLM's design and are used
# by the structured output backends.
@property
......@@ -409,6 +424,7 @@ class MistralTokenizer(TokenizerLike):
) -> list[int]:
add_generation_prompt = kwargs.pop("add_generation_prompt", False)
continue_final_message = kwargs.get("continue_final_message", False)
tokenize = kwargs.get("tokenize", True)
padding = kwargs.get("padding", False)
truncation = kwargs.get("truncation", False)
max_length = kwargs.get("max_length")
......@@ -421,7 +437,7 @@ class MistralTokenizer(TokenizerLike):
conversation=messages,
tools=tools,
continue_final_message=continue_final_message,
tokenize=True,
tokenize=tokenize,
padding=padding,
truncation=truncation,
max_length=max_length,
......
......@@ -22,6 +22,9 @@ class TokenizerLike(Protocol):
) -> "TokenizerLike":
raise NotImplementedError
def num_special_tokens_to_add(self) -> int:
    """Interface stub; always raises `NotImplementedError` here.

    NOTE(review): presumably implementations return how many special tokens
    encoding adds — confirm against the concrete tokenizers.
    """
    raise NotImplementedError
@property
def all_special_tokens(self) -> list[str]:
raise NotImplementedError
......
......@@ -11,14 +11,14 @@ from typing_extensions import assert_never
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
from vllm.transformers_utils.repo_utils import list_filtered_repo_files
from vllm.transformers_utils.utils import (
from vllm.transformers_utils.gguf_utils import (
check_gguf_file,
get_gguf_file_path_from_hf,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from vllm.transformers_utils.repo_utils import list_filtered_repo_files
from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
......@@ -183,7 +183,7 @@ def get_tokenizer(
"`tokenizer_mode='custom'` when initializing vLLM.",
tokenizer_args,
str(tokenizer_kwargs),
tokenizer_mode,
tokenizer_name,
)
tokenizer_mode = str(tokenizer_name)
......
......@@ -26,8 +26,15 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
from vllm import envs
from vllm.logger import init_logger
from vllm.transformers_utils.utils import parse_safetensors_file_metadata
from .config_parser_base import ConfigParserBase
from .gguf_utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
split_remote_gguf,
)
from .repo_utils import (
_get_hf_token,
file_or_path_exists,
......@@ -36,13 +43,6 @@ from .repo_utils import (
try_get_local_file,
with_retry,
)
from .utils import (
check_gguf_file,
is_gguf,
is_remote_gguf,
parse_safetensors_file_metadata,
split_remote_gguf,
)
if envs.VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig
......@@ -89,6 +89,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
step3_text="Step3TextConfig",
qwen3_next="Qwen3NextConfig",
lfm2_moe="Lfm2MoeConfig",
tarsier2="Tarsier2Config",
)
_CONFIG_ATTRS_MAPPING: dict[str, str] = {
......@@ -127,6 +128,9 @@ class HFConfigParser(ConfigParserBase):
if config_dict.get("speculators_config") is not None
else model_type
)
# Allow hf_overrides to override model_type before checking _CONFIG_REGISTRY
if (hf_overrides := kwargs.pop("hf_overrides", None)) is not None:
model_type = hf_overrides.get("model_type", model_type)
if model_type in _CONFIG_REGISTRY:
config_class = _CONFIG_REGISTRY[model_type]
......@@ -300,17 +304,31 @@ def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> No
def patch_rope_parameters(config: PretrainedConfig) -> None:
"""Provide backwards compatibility for RoPE."""
from vllm.config.utils import getattr_iter
rope_theta_names = ("rope_theta", "rotary_emb_base")
rope_theta = getattr_iter(config, rope_theta_names, None)
if Version(version("transformers")) < Version("5.0.0.dev0"):
# Transformers v4 installed, legacy config fields may be present
if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
config.rope_parameters = rope_scaling
if (rope_theta := getattr(config, "rope_theta", None)) is not None:
if rope_theta is not None:
if not hasattr(config, "rope_parameters"):
config.rope_parameters = {"rope_type": "default"}
config.rope_parameters["rope_theta"] = rope_theta
partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct")
partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None)
if partial_rotary_factor is not None:
if not hasattr(config, "rope_parameters"):
config.rope_parameters = {"rope_type": "default"}
config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
elif rope_theta is not None or hasattr(config, "rope_parameters"):
# Transformers v5 installed
config.standardize_rope_params()
config.validate_rope()
# No RoPE parameters to patch
if not hasattr(config, "rope_parameters"):
if getattr(config, "rope_parameters", None) is None:
return
# Add original_max_position_embeddings if present
......@@ -351,7 +369,10 @@ def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None:
rope_parameters["rope_type"] = "longrope"
logger.warning("Replacing legacy rope_type 'su' with 'longrope'")
elif rope_parameters["rope_type"] == "mrope":
assert "mrope_section" in rope_parameters
if "mrope_section" not in rope_parameters:
raise ValueError(
"Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters"
)
rope_parameters["rope_type"] = "default"
logger.warning("Replacing legacy rope_type 'mrope' with 'default'")
......@@ -584,6 +605,7 @@ def get_config(
trust_remote_code=trust_remote_code,
revision=revision,
code_revision=code_revision,
hf_overrides=hf_overrides_kw,
**kwargs,
)
# Special architecture mapping check for GGUF models
......@@ -915,11 +937,13 @@ def get_hf_text_config(config: PretrainedConfig):
"""
text_config = config.get_text_config()
if text_config is not config:
# The code operates under the assumption that text_config should have
# `num_attention_heads` (among others). Assert here to fail early
# if transformers config doesn't align with this assumption.
assert hasattr(text_config, "num_attention_heads")
if text_config is not config and not hasattr(text_config, "num_attention_heads"):
raise ValueError(
"The text_config extracted from the model config does not have "
"`num_attention_heads` attribute. This indicates a mismatch "
"between the model config and vLLM's expectations. Please "
"ensure that the model config is compatible with vLLM."
)
return text_config
......@@ -930,6 +954,13 @@ def try_get_generation_config(
revision: str | None = None,
config_format: str | ConfigFormat = "auto",
) -> GenerationConfig | None:
# GGUF files don't have generation_config.json - their config is embedded
# in the file header. Skip all filesystem lookups to avoid re-reading the
# memory-mapped file, which can hang in multi-process scenarios when the
# EngineCore process already has the file mapped.
if is_gguf(model):
return None
try:
return GenerationConfig.from_pretrained(
model,
......
......@@ -10,46 +10,47 @@ Model configs may be defined in this directory for the following reasons:
deepseek-ai/DeepSeek-V3.2-Exp.
"""
from transformers import DeepseekV3Config
from __future__ import annotations
from vllm.transformers_utils.configs.afmoe import AfmoeConfig
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig
import importlib
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
from vllm.transformers_utils.configs.hunyuan_vl import (
HunYuanVLConfig,
HunYuanVLTextConfig,
HunYuanVLVisionConfig,
)
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig
from vllm.transformers_utils.configs.medusa import MedusaConfig
from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
from vllm.transformers_utils.configs.nemotron import NemotronConfig
from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
from vllm.transformers_utils.configs.olmo3 import Olmo3Config
from vllm.transformers_utils.configs.ovis import OvisConfig
from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
from vllm.transformers_utils.configs.step3_vl import (
Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig,
)
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
# Lookup table for lazy exports: each key is the name of a config class
# exposed by this package and each value is the dotted path of the module
# that defines it. The module-level `__getattr__` consults this table and
# imports the mapped module only when the corresponding name is requested.
_CLASS_TO_MODULE: dict[str, str] = {
"AfmoeConfig": "vllm.transformers_utils.configs.afmoe",
"ChatGLMConfig": "vllm.transformers_utils.configs.chatglm",
"DeepseekVLV2Config": "vllm.transformers_utils.configs.deepseek_vl2",
"DotsOCRConfig": "vllm.transformers_utils.configs.dotsocr",
"EAGLEConfig": "vllm.transformers_utils.configs.eagle",
"FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
"RWConfig": "vllm.transformers_utils.configs.falcon",
"JAISConfig": "vllm.transformers_utils.configs.jais",
"Lfm2MoeConfig": "vllm.transformers_utils.configs.lfm2_moe",
"MedusaConfig": "vllm.transformers_utils.configs.medusa",
"MiDashengLMConfig": "vllm.transformers_utils.configs.midashenglm",
"MLPSpeculatorConfig": "vllm.transformers_utils.configs.mlp_speculator",
"MoonViTConfig": "vllm.transformers_utils.configs.moonvit",
"KimiLinearConfig": "vllm.transformers_utils.configs.kimi_linear",
"KimiVLConfig": "vllm.transformers_utils.configs.kimi_vl",
"NemotronConfig": "vllm.transformers_utils.configs.nemotron",
"NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
"Olmo3Config": "vllm.transformers_utils.configs.olmo3",
"OvisConfig": "vllm.transformers_utils.configs.ovis",
"RadioConfig": "vllm.transformers_utils.configs.radio",
"SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
"UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
"Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
"Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
"Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
# Special case: DeepseekV3Config is from HuggingFace Transformers
"DeepseekV3Config": "transformers",
}
__all__ = [
"AfmoeConfig",
......@@ -82,4 +83,18 @@ __all__ = [
"Step3VisionEncoderConfig",
"Step3TextConfig",
"Qwen3NextConfig",
"Tarsier2Config",
]
def __getattr__(name: str):
    """Resolve a lazily exported config class on attribute access.

    Names present in ``_CLASS_TO_MODULE`` are served by importing the mapped
    module and returning its attribute of the same name.

    Raises:
        AttributeError: if ``name`` is not a key of ``_CLASS_TO_MODULE``.
    """
    target_module = _CLASS_TO_MODULE.get(name)
    if target_module is None:
        raise AttributeError(f"module 'configs' has no attribute '{name}'")
    # Import only now, so the defining module is not loaded until requested.
    return getattr(importlib.import_module(target_module), name)
def __dir__():
    """Return the names declared in ``__all__`` as a new sorted list."""
    # sorted() accepts any iterable and always builds a fresh list, so the
    # intermediate list() copy is unnecessary.
    return sorted(__all__)
......@@ -89,9 +89,14 @@ class NemotronConfig(PretrainedConfig):
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_parameters (`dict`, *optional*):
The parameters of the RoPE embeddings.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
The parameters of the RoPE embeddings. Expected contents:
`rope_theta` (`float`): The base period of the RoPE embeddings.
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear',
'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the
original RoPE implementation.
`partial_rotary_factor` (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output
projection layers during self-attention.
......@@ -133,7 +138,6 @@ class NemotronConfig(PretrainedConfig):
eos_token_id=3,
tie_word_embeddings=False,
rope_parameters=None,
partial_rotary_factor=0.5,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
......@@ -165,14 +169,16 @@ class NemotronConfig(PretrainedConfig):
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
self.rope_parameters = rope_parameters
# for backward compatibility
partial_rotary_factor = (
kwargs.get("rope_percent")
or kwargs.get("rope_percentage")
or partial_rotary_factor
or kwargs.get("partial_rotary_factor")
or 0.5
)
self.partial_rotary_factor = partial_rotary_factor
if "partial_rotary_factor" not in rope_parameters:
rope_parameters["partial_rotary_factor"] = partial_rotary_factor
self.rope_parameters = rope_parameters
self._rope_parameters_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
......
......@@ -189,6 +189,7 @@ class NemotronHConfig(PretrainedConfig):
n_shared_experts=1,
moe_intermediate_size=7688,
moe_shared_expert_intermediate_size=7688,
moe_latent_size=None,
num_experts_per_tok=2,
routed_scaling_factor=1.0,
n_group=1,
......@@ -254,6 +255,7 @@ class NemotronHConfig(PretrainedConfig):
self.n_shared_experts = n_shared_experts
self.moe_intermediate_size = moe_intermediate_size
self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size # noqa: E501
self.moe_latent_size = moe_latent_size
self.num_experts_per_tok = num_experts_per_tok
self.routed_scaling_factor = routed_scaling_factor
self.n_group = n_group
......
......@@ -103,8 +103,8 @@ class Qwen3NextConfig(PretrainedConfig):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
partial_rotary_factor (`float`, *optional*, defaults to 0.25):
Percentage of the query and keys which will have rotary embedding.
`partial_rotary_factor` (`float`, *optional*, defaults to 0.25):
Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
......@@ -198,7 +198,6 @@ class Qwen3NextConfig(PretrainedConfig):
use_cache=True,
tie_word_embeddings=False,
rope_parameters=None,
partial_rotary_factor=0.25,
attention_bias=False,
attention_dropout=0.0,
head_dim=256,
......@@ -239,6 +238,9 @@ class Qwen3NextConfig(PretrainedConfig):
rope_theta = kwargs.pop("rope_theta", 10000.0)
if "rope_theta" not in rope_parameters:
rope_parameters["rope_theta"] = rope_theta
partial_rotary_factor = kwargs.pop("partial_rotary_factor", 0.25)
if "partial_rotary_factor" not in rope_parameters:
rope_parameters["partial_rotary_factor"] = partial_rotary_factor
self.rope_parameters = rope_parameters
self.partial_rotary_factor = partial_rotary_factor
self.attention_bias = attention_bias
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers import Qwen2VLConfig
class Tarsier2Config(Qwen2VLConfig):
    """Flat configuration class for Tarsier2 checkpoints.

    As written, Tarsier2's config.json makes AutoConfig.from_pretrained build
    a deeply nested config consisting of:

    - LlavaConfig
        - Qwen2VLConfig
            - Qwen2VLTextConfig
            - Qwen2VLVisionConfig
        - Qwen2VLConfig
            - Qwen2VLTextConfig
            - Qwen2VLVisionConfig

    whereas it should really just be a single Qwen2VLConfig. This subclass is
    a workaround that stops AutoConfig from creating the nested structure.
    """

    model_type = "tarsier2"
......@@ -61,6 +61,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
norm_init: float = 0.4,
projector_act: str = "swiglu",
projector_ln_mid: bool = False,
num_projector_layers: int = 0,
**kwargs,
):
self.ignore_index = ignore_index
......@@ -71,6 +72,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
self.norm_init = norm_init
self.projector_act = projector_act
self.projector_ln_mid = projector_ln_mid
self.num_projector_layers = num_projector_layers
# N.B. May set the wrapped_model_config below.
self.text_model_id = text_model_id
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment