Commit fc67613a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from importlib.metadata import version
import pytest
import regex as re
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.logprobs import SampleLogprobs
from vllm.multimodal.image import rescale_image_size
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptImageInput,
VllmRunner,
)
from ....utils import multi_gpu_test
from ...utils import check_logprobs_close
# Skip every test in this module when the installed transformers version is
# 5.0 or newer.
# NOTE(review): the reason text below mentions v5.4, while the condition
# already triggers at 5.0 -- confirm which lower bound is intended.
pytestmark = pytest.mark.skipif(
Version("5.0") <= Version(version("transformers")),
reason=(
"vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
"internals (filter_out_non_signature_kwargs) removed by "
"huggingface/transformers#43514"
),
)
# Model identifier used to parametrize the tests in this module.
MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
# Single-image prompts, one per entry; the keys presumably match asset names
# known to IMAGE_ASSETS -- verify against conftest.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<image>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<image>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
# Prompt with two <image> placeholders, used by the multi-image inputs.
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<image>\n<image>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
# dtype string handed to both the vLLM runner and the HF runner.
DTYPE = "half"
# Token budget passed to the greedy generation calls.
MAX_TOKENS = 128
# Value passed as num_logprobs to the greedy generation calls.
NUM_LOGPROBS = 10
def vllm_to_hf_output(
    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
):
    """Rewrite a vLLM result so it can be compared against an HF result.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by vLLM.
            The token ids are ignored and recomputed from the cleaned text.
        model: Name or path used to load the tokenizer.

    Returns:
        ``(hf_token_ids, hf_text, logprobs)`` where ``hf_text`` is the
        cleaned text with ``<|end|><|endoftext|>`` appended and
        ``hf_token_ids`` is the encoding of the cleaned text without a
        leading BOS token.
    """
    _, raw_text, logprobs = vllm_output

    # Remove every run of <image> placeholders, then at most one leading space.
    text = re.sub(r"(<image>)+", "", raw_text)
    if text.startswith(" "):
        text = text[1:]

    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    token_ids = tokenizer.encode(text)
    # The encoding may start with BOS; strip it when it does.
    if token_ids and token_ids[0] == tokenizer.bos_token_id:
        token_ids = token_ids[1:]

    return token_ids, text + "<|end|><|endoftext|>", logprobs
def _build_single_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Assemble every single-image test case in one pass.

    For each size-factor group and each (image, prompt) pair, one case is
    produced: the prompt repeated once per factor, paired with the image
    rescaled by each factor in the group.
    """
    pil_images = [asset.pil_image for asset in image_assets]
    factor_groups = ([1.0], [0.25, 0.5, 1.0])
    return [
        (
            [prompt] * len(factors),
            [rescale_image_size(pil_image, factor) for factor in factors],
        )
        for factors in factor_groups
        for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS)
    ]
def _build_multi_image_inputs(
    image_assets,
) -> list[tuple[list[str], PromptImageInput]]:
    """Assemble every multi-image test case in one pass.

    Each size-factor group yields one case: the multi-image prompt repeated
    once per factor, paired with one list of rescaled images (all assets)
    per factor.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    cases: list[tuple[list[str], PromptImageInput]] = []
    for factors in ([0.5], [0.15, 0.30]):
        prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(factors)
        image_sets = [
            [rescale_image_size(pil_image, scale) for pil_image in pil_images]
            for scale in factors
        ]
        cases.append((prompts, image_sets))
    return cases
def _run_and_compare(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    all_inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    max_model_len: int,
    max_num_seqs: int,
    mm_limit: int,
    gpu_memory_utilization: float,
):
    """Generate with vLLM and HF once each, then compare logprobs per case.

    Args:
        hf_runner: Factory for the HF runner context manager.
        vllm_runner: Factory for the vLLM runner context manager.
        all_inputs: ``(prompts, images)`` cases; every case is run through
            both runners.
        model: Model identifier given to both runners.
        max_model_len: Forwarded to the vLLM runner.
        max_num_seqs: Forwarded to the vLLM runner.
        mm_limit: Image limit placed in ``limit_mm_per_prompt``.
        gpu_memory_utilization: Forwarded to the vLLM runner.
    """
    # NOTE: run vLLM first, then HF. vLLM needs a fresh process without
    # cuda initialization; running HF first would break the multiprocessing
    # backend with fork method.
    vllm_kwargs = {
        "runner": "generate",
        "max_model_len": max_model_len,
        "max_num_seqs": max_num_seqs,
        "gpu_memory_utilization": gpu_memory_utilization,
        "dtype": DTYPE,
        "limit_mm_per_prompt": {"image": mm_limit},
        "tensor_parallel_size": 2,
        "trust_remote_code": True,
        "enforce_eager": True,
    }
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_results = [
            vllm_model.generate_greedy_logprobs(
                case_prompts,
                MAX_TOKENS,
                num_logprobs=NUM_LOGPROBS,
                images=case_images,
            )
            for case_prompts, case_images in all_inputs
        ]

    hf_kwargs = {
        "dtype": DTYPE,
        "model_kwargs": {"_attn_implementation": "sdpa", "device_map": "auto"},
        "auto_cls": AutoModelForCausalLM,
        "trust_remote_code": True,
    }
    with hf_runner(model, **hf_kwargs) as hf_model:
        hf_results = [
            hf_model.generate_greedy_logprobs_limit(
                case_prompts,
                MAX_TOKENS,
                num_logprobs=NUM_LOGPROBS,
                images=case_images,
            )
            for case_prompts, case_images in all_inputs
        ]

    # Cases are compared pairwise, in the order they were generated.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare HF and vLLM logprobs on the single-image inputs."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_single_image_inputs(image_assets),
        model,
        # One image per prompt is enough for the single-image cases.
        mm_limit=1,
        max_model_len=8192,
        max_num_seqs=2,
        gpu_memory_utilization=0.80,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [MODEL_ID])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model) -> None:
    """Compare HF and vLLM logprobs on the multi-image inputs."""
    _run_and_compare(
        hf_runner,
        vllm_runner,
        _build_multi_image_inputs(image_assets),
        model,
        # The multi-image prompt carries two <image> placeholders.
        mm_limit=2,
        max_model_len=8192,
        max_num_seqs=2,
        gpu_memory_utilization=0.80,
    )
...@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets): ...@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
) )
@pytest.mark.skip(
reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
"doesn't resolve chat_template=None to the default template"
)
def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets): def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
"""Compare vLLM Mistral-format output against HF Transformers reference. """Compare vLLM Mistral-format output against HF Transformers reference.
......
...@@ -80,6 +80,11 @@ def run_test( ...@@ -80,6 +80,11 @@ def run_test(
if vllm_runner_kwargs: if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs) vllm_runner_kwargs_.update(vllm_runner_kwargs)
# Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
# already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
if "limit_mm_per_prompt" in vllm_runner_kwargs_:
limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
with vllm_runner( with vllm_runner(
model, model,
max_model_len=max_model_len, max_model_len=max_model_len,
......
...@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam ...@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import VllmRunner from ....conftest import VllmRunner
pytestmark = pytest.mark.skip(
reason="ColQwen3 model's weight tying is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = [ MODELS = [
"TomoroAI/tomoro-colqwen3-embed-4b", "TomoroAI/tomoro-colqwen3-embed-4b",
"OpenSearch-AI/Ops-Colqwen3-4B", "OpenSearch-AI/Ops-Colqwen3-4B",
......
...@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE ...@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import ImageTestAssets from ....conftest import ImageTestAssets
pytestmark = pytest.mark.skip(
reason="InternVisionModel's custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
# we use snapshot_download to prevent conflicts between # we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner # dynamic_module and trust_remote_code for hf_runner
DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
......
...@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam ...@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
pytestmark = pytest.mark.skip(
reason="jinaai/jina-reranker-m0 custom code is incompatible with "
"transformers v5 (missing all_tied_weights_keys)"
)
MODELS = ["jinaai/jina-reranker-m0"] MODELS = ["jinaai/jina-reranker-m0"]
MM_PROCESSOR_KWARGS = { MM_PROCESSOR_KWARGS = {
......
...@@ -17,11 +17,13 @@ ...@@ -17,11 +17,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from importlib.metadata import version
from unittest.mock import MagicMock from unittest.mock import MagicMock
import numpy as np import numpy as np
import pytest import pytest
import torch import torch
from packaging.version import Version
from transformers import PretrainedConfig from transformers import PretrainedConfig
from tests.models.registry import HF_EXAMPLE_MODELS from tests.models.registry import HF_EXAMPLE_MODELS
...@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx): ...@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>" assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
@pytest.mark.skipif(
Version(version("transformers")) >= Version("5.5"),
reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
"with a different get_audio_features signature (requires input_ids)",
)
def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config(): def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
from transformers.models.musicflamingo import ( from transformers.models.musicflamingo import (
modeling_musicflamingo as hf_musicflamingo_modeling, modeling_musicflamingo as hf_musicflamingo_modeling,
......
...@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"internlm/internlm2-chat-7b", trust_remote_code=True "internlm/internlm2-chat-7b", trust_remote_code=True
), ),
"InternLM2VEForCausalLM": _HfExamplesInfo( "InternLM2VEForCausalLM": _HfExamplesInfo(
"OpenGVLab/Mono-InternVL-2B", trust_remote_code=True "OpenGVLab/Mono-InternVL-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `vision_config` is not always set"
)
},
), ),
"InternLM3ForCausalLM": _HfExamplesInfo( "InternLM3ForCausalLM": _HfExamplesInfo(
"internlm/internlm3-8b-instruct", trust_remote_code=True "internlm/internlm3-8b-instruct", trust_remote_code=True
...@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Plamo2ForCausalLM": _HfExamplesInfo( "Plamo2ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-2-1b", "pfnet/plamo-2-1b",
trust_remote_code=True, trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": (
"Custom model code uses `_tied_weight_keys: list[str]` but "
"Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
)
},
), ),
"Plamo3ForCausalLM": _HfExamplesInfo( "Plamo3ForCausalLM": _HfExamplesInfo(
"pfnet/plamo-3-nict-2b-base", "pfnet/plamo-3-nict-2b-base",
...@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
is_available_online=True, is_available_online=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where "
"validate_rope() no longer accepts ignore_keys param"
)
},
), ),
"SeedOssForCausalLM": _HfExamplesInfo( "SeedOssForCausalLM": _HfExamplesInfo(
"ByteDance-Seed/Seed-OSS-36B-Instruct", "ByteDance-Seed/Seed-OSS-36B-Instruct",
...@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"xverse/XVERSE-7B-Chat", "xverse/XVERSE-7B-Chat",
tokenizer="meta-llama/Llama-2-7b", tokenizer="meta-llama/Llama-2-7b",
trust_remote_code=True, trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "XVERSE tokenizer is incompatible with transformers v5 "
"(add_prefix_space / prepend_scheme mismatch).",
},
), ),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
"MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
...@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# [Decoder-only] # [Decoder-only]
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo( "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
"nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0" "nvidia/audio-flamingo-3-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
), ),
"MusicFlamingoForConditionalGeneration": _HfExamplesInfo( "MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
"nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0" "nvidia/music-flamingo-2601-hf",
min_transformers_version="5.3.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
), ),
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"), "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
"BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"), "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
...@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
), ),
"FireRedASR2ForConditionalGeneration": _HfExamplesInfo( "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
"allendou/FireRedASR2-LLM-vllm", "allendou/FireRedASR2-LLM-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
),
"FireRedLIDForConditionalGeneration": _HfExamplesInfo(
"PatchyTisa/FireRedLID-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
), ),
"FunASRForConditionalGeneration": _HfExamplesInfo( "FunASRForConditionalGeneration": _HfExamplesInfo(
"allendou/Fun-ASR-Nano-2512-vllm", "allendou/Fun-ASR-Nano-2512-vllm",
trust_remote_code=True,
max_transformers_version="5.1",
transformers_version_reason={
"vllm": "Incompatible with transformers v5.2+ "
"(dict object has no attribute '__name__').",
},
), ),
"FunAudioChatForConditionalGeneration": _HfExamplesInfo( "FunAudioChatForConditionalGeneration": _HfExamplesInfo(
"funaudiochat", is_available_online=False "funaudiochat", is_available_online=False
...@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"HCXVisionForCausalLM": _HfExamplesInfo( "HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True, trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"Custom config cannot be loaded with Transformers "
"v5 because `text_config` is not always set"
)
},
), ),
"HCXVisionV2ForCausalLM": _HfExamplesInfo( "HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B", "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
...@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"}, extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
), ),
"InternS1ForConditionalGeneration": _HfExamplesInfo( "InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True "internlm/Intern-S1",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom tokenizer code is not compatible with Transformers v5."
},
), ),
"InternS1ProForConditionalGeneration": _HfExamplesInfo( "InternS1ProForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1-Pro", "internlm/Intern-S1-Pro",
...@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiDashengLMModel": _HfExamplesInfo( "MiDashengLMModel": _HfExamplesInfo(
"mispeech/midashenglm-7b", trust_remote_code=True "mispeech/midashenglm-7b", trust_remote_code=True
), ),
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMO": _HfExamplesInfo(
"openbmb/MiniCPM-o-2_6",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"hf": "Custom processor code is not compatible with Transformers v5."
},
),
"MiniCPMV": _HfExamplesInfo( "MiniCPMV": _HfExamplesInfo(
"openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-Llama3-V-2_5",
extras={ extras={
...@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"4.0": "openbmb/MiniCPM-V-4", "4.0": "openbmb/MiniCPM-V-4",
"4.5": "openbmb/MiniCPM-V-4_5", "4.5": "openbmb/MiniCPM-V-4_5",
}, },
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"MiniCPMVBatchFeature is incompatible with its base class in "
"Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
)
},
trust_remote_code=True, trust_remote_code=True,
), ),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo( "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
...@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"nano_vl_dummy", is_available_online=False, trust_remote_code=True "nano_vl_dummy", is_available_online=False, trust_remote_code=True
), ),
"OpenCUAForConditionalGeneration": _HfExamplesInfo( "OpenCUAForConditionalGeneration": _HfExamplesInfo(
"xlangai/OpenCUA-7B", trust_remote_code=True "xlangai/OpenCUA-7B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Tokenizer cannot be initialised in Transformers v5."
},
), ),
"OpenPanguVLForConditionalGeneration": _HfExamplesInfo( "OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
"FreedomIntelligence/openPangu-VL-7B", "FreedomIntelligence/openPangu-VL-7B",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
enforce_eager=True, enforce_eager=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": (
"OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
"making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
)
},
), ),
"Ovis": _HfExamplesInfo( "Ovis": _HfExamplesInfo(
"AIDC-AI/Ovis2-1B", "AIDC-AI/Ovis2-1B",
...@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
}, },
), ),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True), "Ovis2_5": _HfExamplesInfo(
"AIDC-AI/Ovis2.5-2B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
),
"Ovis2_6ForCausalLM": _HfExamplesInfo( "Ovis2_6ForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True "AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
), ),
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo( "Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True "AIDC-AI/Ovis2.6-30B-A3B",
trust_remote_code=True,
max_transformers_version="4.57",
transformers_version_reason={
"vllm": "Custom processor code is not compatible with Transformers v5."
},
), ),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo( "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL", "PaddlePaddle/PaddleOCR-VL",
...@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
}, # noqa: E501 }, # noqa: E501
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}, extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
), ),
"Phi4ForCausalLMV": _HfExamplesInfo(
"microsoft/Phi-4-reasoning-vision-15B",
trust_remote_code=True,
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"vllm upgraded transformers above v5.4 where HF model "
"custom code uses siglip2 internals "
"(filter_out_non_signature_kwargs) removed "
"by huggingface/transformers#43514"
)
},
),
"Phi4MMForCausalLM": _HfExamplesInfo( "Phi4MMForCausalLM": _HfExamplesInfo(
"microsoft/Phi-4-multimodal-instruct", trust_remote_code=True "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
), ),
...@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"architectures": ["Tarsier2ForConditionalGeneration"], "architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2", "model_type": "tarsier2",
}, },
max_transformers_version="5.3",
transformers_version_reason={
"vllm": (
"Qwen2VLConfig was split into Qwen2VLConfig + "
"Qwen2VLTextConfig in transformers v5, breaking "
"attribute access (num_attention_heads, hidden_size, etc.)"
)
},
), ),
"VoxtralForConditionalGeneration": _HfExamplesInfo( "VoxtralForConditionalGeneration": _HfExamplesInfo(
"mistralai/Voxtral-Mini-3B-2507", "mistralai/Voxtral-Mini-3B-2507",
......
...@@ -375,6 +375,7 @@ def softmax(data): ...@@ -375,6 +375,7 @@ def softmax(data):
@dataclass @dataclass
class ModelInfo: class ModelInfo:
name: str name: str
revision: str | None = None
architecture: str = "" architecture: str = ""
dtype: str = "auto" dtype: str = "auto"
max_model_len: int | None = None max_model_len: int | None = None
...@@ -468,7 +469,16 @@ def dummy_hf_overrides( ...@@ -468,7 +469,16 @@ def dummy_hf_overrides(
else: else:
# Use minimal layers for testing # Use minimal layers for testing
num_layers = 1 num_layers = 1
num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1 num_hidden_layers = (
3
if model_arch
in (
"Gemma3nForConditionalGeneration",
"Gemma4ForCausalLM",
"Gemma4ForConditionalGeneration",
)
else 1
)
update_dict = { update_dict = {
"num_layers": num_layers, "num_layers": num_layers,
......
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
import pytest import pytest
from tests.reasoning.utils import run_reasoning_extraction from tests.reasoning.utils import run_reasoning_extraction
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
# Using mistral tokenizer as a generic mock since the actual model is not on HF # Using mistral tokenizer as a generic mock since the actual model is not on HF
...@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = { ...@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
"is_reasoning_end": True, "is_reasoning_end": True,
} }
# Each case dict below pairs a raw model "output" with the expected extracted
# "reasoning", the expected "content", and the expected is_reasoning_end value.

# Channel opens with a "thought\n" prefix; the expected reasoning omits it.
THOUGHT_PREFIX = {
"output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
"reasoning": "Actual reasoning here",
"content": "Final answer",
"is_reasoning_end": True,
}
# Channel contains only the "thought\n" prefix: empty reasoning, no content.
THOUGHT_PREFIX_ONLY = {
"output": "<|channel>thought\n<channel|>",
"reasoning": "",
"content": None,
"is_reasoning_end": True,
}
# Newlines inside the reasoning body are expected to be kept as-is.
THOUGHT_PREFIX_MULTILINE = {
"output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
"reasoning": "Line1\nLine2",
"content": "Answer",
"is_reasoning_end": True,
}
# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
# The full text is expected back as reasoning since no prefix is matched.
THOUGHT_PREFIX_DIVERGE = {
"output": "<|channel>thousand reasons<channel|>Done",
"reasoning": "thousand reasons",
"content": "Done",
"is_reasoning_end": True,
}
# The model isn't reasoning if we're generating tool calls.
# The tool-call marker is expected to pass through unchanged as content.
TOOL_CALL_STARTED = {
"output": "<|tool_call>",
"reasoning": None,
"content": "<|tool_call>",
"is_reasoning_end": True,
}
TEST_CASES = [ TEST_CASES = [
pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"), pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"), pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"),
...@@ -120,17 +156,22 @@ TEST_CASES = [ ...@@ -120,17 +156,22 @@ TEST_CASES = [
pytest.param(False, EMPTY, id="empty"), pytest.param(False, EMPTY, id="empty"),
pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"), pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"),
pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"), pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"),
pytest.param(False, THOUGHT_PREFIX, id="thought_prefix"),
pytest.param(True, THOUGHT_PREFIX, id="thought_prefix_streaming"),
pytest.param(False, THOUGHT_PREFIX_ONLY, id="thought_prefix_only"),
pytest.param(True, THOUGHT_PREFIX_ONLY, id="thought_prefix_only_streaming"),
pytest.param(False, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline"),
pytest.param(
True, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline_streaming"
),
pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
] ]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
def test_gemma4_reasoning(
streaming: bool,
param_dict: dict,
generic_tokenizer,
):
output = param_dict["output"]
# Resolve token IDs dynamically from the real tokenizer # Resolve token IDs dynamically from the real tokenizer
vocab = generic_tokenizer.get_vocab() vocab = generic_tokenizer.get_vocab()
start_token_id = vocab["<|channel>"] start_token_id = vocab["<|channel>"]
...@@ -176,6 +217,18 @@ def test_gemma4_reasoning( ...@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
else: else:
output_tokens += _encode(output) output_tokens += _encode(output)
return output_tokens
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_gemma4_reasoning(
streaming: bool,
param_dict: dict,
generic_tokenizer,
):
output = param_dict["output"]
output_tokens = gemma4_encode_output(generic_tokenizer, output)
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)( parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
generic_tokenizer generic_tokenizer
) )
...@@ -194,3 +247,29 @@ def test_gemma4_reasoning( ...@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
# Test is_reasoning_end # Test is_reasoning_end
is_reasoning_end = parser.is_reasoning_end(output_tokens) is_reasoning_end = parser.is_reasoning_end(output_tokens)
assert is_reasoning_end == param_dict["is_reasoning_end"] assert is_reasoning_end == param_dict["is_reasoning_end"]
def test_gemma4_adjust_request(generic_tokenizer):
    """adjust_request flips skip_special_tokens off on the same request."""
    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    request = ChatCompletionRequest(messages=[], model="test-model")
    # Precondition: a fresh request has skip_special_tokens enabled.
    assert request.skip_special_tokens is True

    adjusted = parser.adjust_request(request)
    assert adjusted.skip_special_tokens is False
    # The parser is expected to return the very object it was given.
    assert adjusted is request
def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
    """A closed thought channel from an earlier turn must not count as the
    end of reasoning once a new model turn has been opened."""
    earlier_turn = "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
    new_turn = "<|turn>user\nThanks<|turn>model\n"
    output_tokens = gemma4_encode_output(generic_tokenizer, earlier_turn + new_turn)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    assert not parser.is_reasoning_end(output_tokens)
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.tokenizers import get_tokenizer
parser_name = "step3p5" parser_name = "step3p5"
start_token = "<think>" start_token = "<think>"
...@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash" ...@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def step3p5_tokenizer(): def step3p5_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
SIMPLE_REASONING = { SIMPLE_REASONING = {
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Gemma4 chat template rendering."""
from pathlib import Path
import jinja2.sandbox
import pytest
# Location of the Gemma4 tool chat template: three directory levels above
# this file, inside "examples".
TEMPLATE_PATH = Path(__file__).resolve().parent.parent.parent.joinpath(
    "examples", "tool_chat_template_gemma4.jinja"
)
@pytest.fixture(scope="module")
def gemma4_template():
    """Load and compile the Gemma4 chat template.

    Returns:
        The template at ``TEMPLATE_PATH`` compiled by an
        ``ImmutableSandboxedEnvironment``. The fixture is module-scoped, so
        the file is read and compiled once per test module.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    template_str = TEMPLATE_PATH.read_text(encoding="utf-8")
    env = jinja2.sandbox.ImmutableSandboxedEnvironment()
    return env.from_string(template_str)
def _render(template, messages, **kwargs):
"""Render the template with sensible defaults."""
kwargs.setdefault("bos_token", "<bos>")
kwargs.setdefault("add_generation_prompt", False)
return template.render(messages=messages, **kwargs)
class TestGemma4ChatTemplate:
def test_basic_multiturn_thinking_disabled(self, gemma4_template):
    """Without enable_thinking, the generation prompt must close with an
    empty thought channel."""
    conversation = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"},
    ]
    rendered = _render(gemma4_template, conversation, add_generation_prompt=True)

    # Turn markers and every message body must appear in the prompt.
    expected_fragments = (
        "<|turn>user\n",
        "<|turn>model\n",
        "Hello",
        "Hi there!",
        "How are you?",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    assert rendered.rstrip("\n").endswith("<|channel>thought\n<channel|>")
def test_basic_multiturn_thinking_enabled(self, gemma4_template):
    """With enable_thinking=True, the generation prompt must end right
    after the model turn opener, with no thought channel appended."""
    conversation = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"},
    ]
    render_options = {"add_generation_prompt": True, "enable_thinking": True}
    rendered = _render(gemma4_template, conversation, **render_options)

    # Turn markers and every message body must appear in the prompt.
    expected_fragments = (
        "<|turn>user\n",
        "<|turn>model\n",
        "Hello",
        "Hi there!",
        "How are you?",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    assert rendered.rstrip("\n").endswith("<|turn>model")
def test_system_message(self, gemma4_template):
    """A system message is rendered inside a system turn."""
    system_msg = {"role": "system", "content": "You are helpful."}
    user_msg = {"role": "user", "content": "Hi"}
    rendered = _render(gemma4_template, [system_msg, user_msg])

    assert "<|turn>system\n" in rendered
    assert "You are helpful." in rendered
def test_thinking_enabled(self, gemma4_template):
messages = [{"role": "user", "content": "Think about this"}]
result = _render(
gemma4_template,
messages,
add_generation_prompt=True,
enable_thinking=True,
)
assert "<|think|>" in result
assert "<|turn>system\n" in result
def test_tool_declarations(self, gemma4_template):
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get weather for a city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "City name",
}
},
"required": ["city"],
},
},
}
]
messages = [{"role": "user", "content": "What is the weather?"}]
result = _render(
gemma4_template,
messages,
tools=tools,
add_generation_prompt=True,
)
assert "<|tool>" in result
assert "declaration:get_weather" in result
assert "<tool|>" in result
assert '<|"|>City name<|"|>' in result
def test_tool_calls_in_assistant(self, gemma4_template):
messages = [
{"role": "user", "content": "Weather in London?"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "call_1",
"function": {
"name": "get_weather",
"arguments": {"city": "London"},
},
}
],
},
]
result = _render(gemma4_template, messages)
assert "<|tool_call>call:get_weather{" in result
assert "}<tool_call|>" in result
assert '<|"|>London<|"|>' in result
def test_tool_responses_openai_style(self, gemma4_template):
"""role='tool' messages are formatted as <|tool_response> blocks
with content dumped as-is."""
messages = [
{"role": "user", "content": "Weather?"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "call_1",
"function": {
"name": "get_weather",
"arguments": {"city": "London"},
},
}
],
},
{
"role": "tool",
"tool_call_id": "call_1",
"content": '{"temperature": 15, "condition": "sunny"}',
},
]
result = _render(gemma4_template, messages, add_generation_prompt=True)
assert "<|tool_response>" in result
assert "response:get_weather{" in result
assert "<tool_response|>" in result
assert '"temperature": 15' in result
def test_tool_responses_legacy_style(self, gemma4_template):
"""tool_responses embedded on the assistant message."""
messages = [
{"role": "user", "content": "Weather?"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"function": {
"name": "get_weather",
"arguments": {"city": "London"},
},
}
],
"tool_responses": [
{
"name": "get_weather",
"response": {"temperature": 20},
}
],
},
]
result = _render(gemma4_template, messages)
assert "<|tool_response>" in result
assert "response:get_weather{" in result
assert "temperature:" in result
def test_generation_prompt_not_after_tool_response(self, gemma4_template):
"""add_generation_prompt=True should NOT add <|turn>model when the
last message type was tool_response (the model turn continues)."""
messages = [
{"role": "user", "content": "Weather?"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "call_1",
"function": {
"name": "get_weather",
"arguments": {"city": "London"},
},
}
],
},
{
"role": "tool",
"tool_call_id": "call_1",
"content": "sunny",
},
]
result = _render(gemma4_template, messages, add_generation_prompt=True)
assert not result.strip().endswith("<|turn>model\n")
def test_reasoning_in_tool_chains(self, gemma4_template):
"""reasoning field on assistant with tool_calls after last user
message emits <|channel>thought\\n...<channel|>."""
messages = [
{"role": "user", "content": "Calculate something"},
{
"role": "assistant",
"content": "",
"reasoning": "Let me think about this...",
"tool_calls": [
{
"function": {
"name": "calculator",
"arguments": {"expr": "2+2"},
},
}
],
},
]
result = _render(gemma4_template, messages)
assert "<|channel>thought\n" in result
assert "Let me think about this..." in result
assert "<channel|>" in result
def test_reasoning_not_before_last_user(self, gemma4_template):
"""reasoning on assistant BEFORE the last user message is dropped."""
messages = [
{"role": "user", "content": "First"},
{
"role": "assistant",
"content": "Response",
"reasoning": "Old reasoning that should be dropped",
"tool_calls": [
{
"function": {
"name": "fn",
"arguments": {},
},
}
],
},
{"role": "user", "content": "Second"},
]
result = _render(gemma4_template, messages, add_generation_prompt=True)
assert "Old reasoning" not in result
def test_strip_thinking_in_model_content(self, gemma4_template):
"""<|channel>...<channel|> in model content is stripped by the
strip_thinking macro."""
messages = [
{"role": "user", "content": "Hi"},
{
"role": "assistant",
"content": ("<|channel>internal thought<channel|>Visible answer"),
},
]
result = _render(gemma4_template, messages)
assert "internal thought" not in result
assert "Visible answer" in result
def test_multi_turn_tool_chain(self, gemma4_template):
"""assistant->tool->assistant->tool produces exactly one
<|turn>model (later assistants continue the same turn)."""
messages = [
{"role": "user", "content": "Do two things"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "c1",
"function": {"name": "step1", "arguments": {}},
},
],
},
{"role": "tool", "tool_call_id": "c1", "content": "result1"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "c2",
"function": {"name": "step2", "arguments": {}},
},
],
},
{"role": "tool", "tool_call_id": "c2", "content": "result2"},
]
result = _render(gemma4_template, messages, add_generation_prompt=True)
assert result.count("<|turn>model\n") == 1
def test_format_argument_types(self, gemma4_template):
"""Strings wrapped in <|"|>, booleans as true/false, numbers bare."""
messages = [
{"role": "user", "content": "Test"},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"function": {
"name": "test_fn",
"arguments": {
"name": "Alice",
"active": True,
"count": 42,
},
},
}
],
},
]
result = _render(gemma4_template, messages)
assert '<|"|>Alice<|"|>' in result
assert "active:true" in result
assert "count:42" in result
...@@ -85,6 +85,14 @@ class TestParseGemma4Args: ...@@ -85,6 +85,14 @@ class TestParseGemma4Args:
result = _parse_gemma4_args("flag:false") result = _parse_gemma4_args("flag:false")
assert result == {"flag": False} assert result == {"flag": False}
def test_null_value(self):
    """A bare ``null`` parses to ``None`` and serializes as JSON ``null``."""
    # If `null` were kept as the string "null", tool_choice=auto would emit
    # `{"param": "null"}` instead of `{"param": null}` for nullable tool
    # parameters.
    parsed = _parse_gemma4_args("param:null")
    assert parsed == {"param": None}
    assert json.dumps(parsed) == '{"param": null}'
def test_mixed_types(self): def test_mixed_types(self):
result = _parse_gemma4_args( result = _parse_gemma4_args(
'name:<|"|>test<|"|>,count:42,active:true,score:3.14' 'name:<|"|>test<|"|>,count:42,active:true,score:3.14'
...@@ -114,6 +122,19 @@ class TestParseGemma4Args: ...@@ -114,6 +122,19 @@ class TestParseGemma4Args:
result = _parse_gemma4_args("key:") result = _parse_gemma4_args("key:")
assert result == {"key": ""} assert result == {"key": ""}
def test_empty_value_partial_withheld(self):
    """In partial mode a key without a value yields no entry at all."""
    # Covers the bare colon first, then a colon followed by a space.
    for raw_args in ("key:", "key: "):
        assert _parse_gemma4_args(raw_args, partial=True) == {}
def test_empty_value_after_other_keys_partial_withheld(self):
    """In partial mode only the trailing valueless key is dropped."""
    raw_args = 'name:<|"|>test<|"|>,flag:'
    parsed = _parse_gemma4_args(raw_args, partial=True)
    assert parsed == {"name": "test"}
class TestParseGemma4Array: class TestParseGemma4Array:
def test_string_array(self): def test_string_array(self):
...@@ -491,6 +512,51 @@ class TestStreamingExtraction: ...@@ -491,6 +512,51 @@ class TestStreamingExtraction:
assert parsed_args["count"] == 42 assert parsed_args["count"] == 42
assert parsed_args["active"] is True assert parsed_args["active"] is True
def test_streaming_boolean_split_across_chunks(self, parser, mock_request):
    """A ``true`` literal split across two chunks still streams valid JSON."""
    # The boolean arrives as "tru" in one chunk and "e" in the next.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:search{input:{all:tru",
            "e}}",
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"
    assert json.loads(args_text)["input"]["all"] is True
def test_streaming_false_split_across_chunks(self, parser, mock_request):
    """A ``false`` literal split across two chunks is streamed as ``False``."""
    # The boolean arrives as "fals" in one chunk and "e" in the next.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:set{flag:fals",
            "e}",
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"
    assert json.loads(args_text)["flag"] is False
def test_streaming_number_split_across_chunks(self, parser, mock_request):
    """A number whose digits span two chunks is streamed as one value."""
    # The digits "4" and "2" arrive in separate chunks.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:set{count:4",
            "2}",
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"
    assert json.loads(args_text)["count"] == 42
def test_streaming_empty_args(self, parser, mock_request): def test_streaming_empty_args(self, parser, mock_request):
"""Tool call with no arguments.""" """Tool call with no arguments."""
chunks = [ chunks = [
...@@ -502,3 +568,119 @@ class TestStreamingExtraction: ...@@ -502,3 +568,119 @@ class TestStreamingExtraction:
results = self._simulate_streaming(parser, mock_request, chunks) results = self._simulate_streaming(parser, mock_request, chunks)
name = self._collect_function_name(results) name = self._collect_function_name(results)
assert name == "get_status" assert name == "get_status"
def test_streaming_split_delimiter_no_invalid_json(self, parser, mock_request):
    """Partial <|"|> delimiter chars must not leak into streamed JSON.

    Reproduces the bug from https://github.com/vllm-project/vllm/issues/38946
    where a token boundary splits the string delimiter, leaving fragments
    like '<|' at the end of a parsed value which then corrupt the JSON.
    """
    # The closing delimiter is split as '<|' + '"|>' across two chunks.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:todowrite{",
            'content:<|"|>Buy milk<|',
            '"|>}',
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"

    # json.loads raising here is exactly how the original bug showed up.
    decoded = json.loads(args_text)
    assert decoded["content"] == "Buy milk"

    # The raw delimiter fragment must not appear anywhere in the JSON text.
    assert "<|" not in args_text, (
        f"Partial delimiter leaked into JSON: {args_text!r}"
    )
def test_streaming_does_not_duplicate_plain_text_after_tool_call(
    self, parser, mock_request, monkeypatch
):
    """Buffered plain text after a tool call must not corrupt current_text."""
    seen_current_texts: list[str] = []
    real_extract = parser._extract_streaming

    def recording_extract(previous_text, current_text, delta_text):
        # Record what the parser was handed, then defer to the real method.
        seen_current_texts.append(current_text)
        return real_extract(previous_text, current_text, delta_text)

    monkeypatch.setattr(parser, "_extract_streaming", recording_extract)

    # The "<" that opens "<div>" shares a chunk with the closing tool-call tag.
    results = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:get_weather{",
            'location:<|"|>Paris<|"|>}',
            "<tool_call|><",
            "div>",
        ],
    )

    streamed_content = "".join(
        delta.content
        for delta, _ in results
        if delta is not None and delta.content
    )
    assert streamed_content == "<div>"

    final_text = seen_current_texts[-1]
    assert final_text.endswith("<tool_call|><div>")
    assert not final_text.endswith("<tool_call|><<div>")
def test_streaming_html_argument_does_not_duplicate_tag_prefixes(
    self, parser, mock_request
):
    """HTML content inside tool arguments must not be duplicated."""
    # Several chunks end with a lone "<" that opens the next HTML tag.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:write_file{",
            'path:<|"|>index.html<|"|>,',
            'content:<|"|><!DOCTYPE html>\n<',
            'html lang="zh-CN">\n<',
            "head>\n  <",
            'meta charset="UTF-8">\n  <',
            'meta name="viewport" content="width=device-width">\n',
            '<|"|>}',
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text

    expected_content = (
        "<!DOCTYPE html>\n"
        '<html lang="zh-CN">\n'
        "<head>\n"
        '  <meta charset="UTF-8">\n'
        '  <meta name="viewport" content="width=device-width">\n'
    )
    decoded = json.loads(args_text)
    assert decoded["path"] == "index.html"
    assert decoded["content"] == expected_content
def test_streaming_trailing_bare_bool_not_duplicated(self, parser, mock_request):
    """Trailing bare boolean must not be streamed twice."""
    # The last key and its bare boolean value arrive in separate chunks.
    streamed = self._simulate_streaming(
        parser,
        mock_request,
        [
            "<|tool_call>",
            "call:Edit{",
            'file_path:<|"|>src/env.py<|"|>,',
            'old_string:<|"|>old_val<|"|>,',
            'new_string:<|"|>new_val<|"|>,',
            "replace_all:",
            "false}",
            "<tool_call|>",
        ],
    )
    args_text = self._collect_arguments(streamed)
    assert args_text, "No arguments were streamed"

    expected_args = {
        "file_path": "src/env.py",
        "old_string": "old_val",
        "new_string": "new_val",
        "replace_all": False,
    }
    assert json.loads(args_text) == expected_args
    # The key must appear exactly once in the concatenated stream.
    assert args_text.count("replace_all") == 1
...@@ -542,12 +542,16 @@ def test_eagle_correctness_light( ...@@ -542,12 +542,16 @@ def test_eagle_correctness_light(
"auto", "auto",
0.8, 0.8,
), ),
( pytest.param(
("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
False, False,
False, False,
"transformers", "transformers",
0.8, 0.8,
# TODO(hmellor): figure out why memory usage is so high
marks=pytest.mark.skip(
reason="Feature is experimental and uses too much memory in CI",
),
), ),
pytest.param( pytest.param(
( (
......
...@@ -3397,3 +3397,38 @@ if hasattr(torch.ops._C, "hadacore_transform"): ...@@ -3397,3 +3397,38 @@ if hasattr(torch.ops._C, "hadacore_transform"):
@register_fake("_C::hadacore_transform") @register_fake("_C::hadacore_transform")
def _hadacore_transform_fake(x: torch.Tensor, inplace: bool) -> torch.Tensor: def _hadacore_transform_fake(x: torch.Tensor, inplace: bool) -> torch.Tensor:
return torch.empty_like(x) if not inplace else x return torch.empty_like(x) if not inplace else x
# Register the fake implementation only when the op itself is available.
if hasattr(torch.ops._C, "minimax_allreduce_rms"):

    @register_fake("_C::minimax_allreduce_rms")
    def _minimax_allreduce_rms_fake(
        input: torch.Tensor,
        norm_weight: torch.Tensor,
        workspace: torch.Tensor,
        rank: int,
        nranks: int,
        eps: float,
    ) -> torch.Tensor:
        """Fake impl: return an uninitialized tensor shaped like ``input``."""
        fake_out = torch.empty_like(input)
        return fake_out
# Register the fake implementation only when the op itself is available.
if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):

    @register_fake("_C::minimax_allreduce_rms_qk")
    def _minimax_allreduce_rms_qk_fake(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        workspace: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Fake impl: return uninitialized ``(q, k)`` outputs.

        The outputs have shapes ``[qkv.shape[0], q_size]`` and
        ``[qkv.shape[0], kv_size]`` and share ``qkv``'s dtype and device.
        """
        num_rows = qkv.shape[0]
        fake_q = torch.empty([num_rows, q_size], dtype=qkv.dtype, device=qkv.device)
        fake_k = torch.empty([num_rows, kv_size], dtype=qkv.dtype, device=qkv.device)
        return (fake_q, fake_k)
...@@ -205,6 +205,8 @@ def support_torch_compile( ...@@ -205,6 +205,8 @@ def support_torch_compile(
if v.annotation in [ if v.annotation in [
torch.Tensor, torch.Tensor,
torch.Tensor | None, torch.Tensor | None,
torch.FloatTensor,
torch.FloatTensor | None,
IntermediateTensors, IntermediateTensors,
IntermediateTensors | None, IntermediateTensors | None,
]: ]:
...@@ -346,7 +348,7 @@ def _support_torch_compile( ...@@ -346,7 +348,7 @@ def _support_torch_compile(
def __init__( def __init__(
self: _T, self: _T,
*, *args,
vllm_config: VllmConfig | None = None, vllm_config: VllmConfig | None = None,
prefix: str = "", prefix: str = "",
**kwargs: Any, **kwargs: Any,
...@@ -357,11 +359,24 @@ def _support_torch_compile( ...@@ -357,11 +359,24 @@ def _support_torch_compile(
# NOTE: to support multimodal models (such as encoder), # NOTE: to support multimodal models (such as encoder),
# we may not have vllm_config so we may need to patch it # we may not have vllm_config so we may need to patch it
sig = inspect.signature(old_init) sig = inspect.signature(old_init)
# Check that any positional arguments match the old_init method signature
annotations = [p.annotation for p in sig.parameters.values()]
for arg, annotation in zip(args, annotations):
if annotation is inspect._empty:
continue
if not isinstance(arg, annotation):
init = f"'{type(self).__name__}.__init__'"
arg_type = f"'{type(arg).__name__}'"
raise TypeError(
f"{init} received a positional argument of type {arg_type}, "
"but no parameter of that type was found in the method signature. "
f"Please either annotate {init} or pass it as a keyword argument."
)
if "vllm_config" in sig.parameters: if "vllm_config" in sig.parameters:
kwargs["vllm_config"] = vllm_config kwargs["vllm_config"] = vllm_config
if "prefix" in sig.parameters: if "prefix" in sig.parameters:
kwargs["prefix"] = prefix kwargs["prefix"] = prefix
old_init(self, **kwargs) old_init(self, *args, **kwargs)
self.vllm_config = vllm_config self.vllm_config = vllm_config
self.compilation_config = self.vllm_config.compilation_config self.compilation_config = self.vllm_config.compilation_config
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Fusion pass: replace MiniMax QK allreduce + RMS norm with the Lamport
fused kernel (minimax_allreduce_rms_qk) for decode-size batches.
Pattern (inlined forward_qk in compiled graph):
q, k, v = qkv.split([q_size, kv_size, kv_size], -1)
q_fp32 = q.to(float32); k_fp32 = k.to(float32)
q_var = q_fp32.pow(2).mean(-1, keepdim=True)
k_var = k_fp32.pow(2).mean(-1, keepdim=True)
qk_var = cat([q_var, k_var], -1)
qk_var = allreduce(qk_var) / tp_world
q_var, k_var = qk_var.chunk(2, -1)
q_out = (q_fp32 * rsqrt(q_var + eps) * q_weight).to(orig_dtype)
k_out = (k_fp32 * rsqrt(k_var + eps) * k_weight).to(orig_dtype)
return q_out, k_out, v
Replacement (pure, no in-place on qkv/q/k):
q_out, k_out = minimax_qk_norm_fused(qkv, q_weight, k_weight, q_size, kv_size, ...)
v = qkv.split([q_size, kv_size, kv_size], -1)[2]
return q_out, k_out, v
is_applicable_for_range: the pass only fires when compile_range.end <= max_token_num,
so that large prefill batches fall through to the original, unfused forward_qk.
"""
import torch
import torch._inductor.pattern_matcher as pm
import torch.fx as fx
from torch._inductor.pattern_matcher import PatternMatcherPass
from vllm.config import VllmConfig
from vllm.config.utils import Range
from vllm.distributed import tensor_model_parallel_all_reduce
from vllm.distributed.parallel_state import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from vllm.logger import init_logger
from vllm.utils.torch_utils import direct_register_custom_op
from ..inductor_pass import enable_fake_mode
from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
logger = init_logger(__name__)
# Upper bound for the pass's token limit; the pass uses the smaller of this
# value and the scheduler's max_num_batched_tokens.
MAX_TOKEN_NUM = 2048
# Handle to the registered fused custom op. Stays None when the
# `_C.minimax_allreduce_rms_qk` kernel is not available.
_MINIMAX_QK_NORM_FUSED_OP = None
# The custom op is only defined and registered when the underlying kernel exists.
if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):

    def _minimax_qk_norm_fused(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
        max_tokens: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Look up the allreduce workspace and call the fused QK-norm kernel.

        The workspace is obtained from ``get_allreduce_workspace`` for the
        given rank, world size and ``max_tokens``, then forwarded to
        ``torch.ops._C.minimax_allreduce_rms_qk`` with the other arguments.

        Returns:
            The kernel's result, annotated as a ``(q_out, k_out)`` tuple.
        """
        from vllm.distributed.parallel_state import get_tp_group
        from vllm.model_executor.layers.mamba.lamport_workspace import (
            get_allreduce_workspace,
        )

        tp_cpu_group = get_tp_group().cpu_group
        lamport_workspace = get_allreduce_workspace(
            rank=rank,
            world_size=nranks,
            max_tokens=max_tokens,
            process_group=tp_cpu_group,
        )
        fused_kernel = torch.ops._C.minimax_allreduce_rms_qk
        return fused_kernel(
            qkv,
            norm_weight_q,
            norm_weight_k,
            lamport_workspace,
            q_size,
            kv_size,
            rank,
            nranks,
            eps,
        )

    def _minimax_qk_norm_fused_fake(
        qkv: torch.Tensor,
        norm_weight_q: torch.Tensor,
        norm_weight_k: torch.Tensor,
        q_size: int,
        kv_size: int,
        rank: int,
        nranks: int,
        eps: float,
        max_tokens: int,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Fake impl: uninitialized outputs of shape ``[rows, q_size]`` and
        ``[rows, kv_size]`` with ``qkv``'s dtype and device."""
        num_rows = qkv.shape[0]
        fake_q = torch.empty([num_rows, q_size], dtype=qkv.dtype, device=qkv.device)
        fake_k = torch.empty([num_rows, kv_size], dtype=qkv.dtype, device=qkv.device)
        return (fake_q, fake_k)

    direct_register_custom_op(
        op_name="minimax_qk_norm_fused",
        op_func=_minimax_qk_norm_fused,
        fake_impl=_minimax_qk_norm_fused_fake,
        mutates_args=[],
    )
    _MINIMAX_QK_NORM_FUSED_OP = torch.ops.vllm.minimax_qk_norm_fused.default
class MiniMaxQKNormPattern:
"""
Match the forward_qk allreduce+rms pattern and replace with Lamport kernel.
"""
def __init__(
self,
q_size: int,
kv_size: int,
eps: float,
tp_world: int,
tp_rank: int,
max_tokens: int,
dtype: torch.dtype,
device: str | None,
) -> None:
self.q_size = q_size
self.kv_size = kv_size
self.eps = eps
self.tp_world = tp_world
self.tp_rank = tp_rank
self.max_tokens = max_tokens
self.dtype = dtype
self.device = device
def get_inputs(self) -> list[torch.Tensor]:
T = 4
qkv = torch.empty(
[T, self.q_size + 2 * self.kv_size],
device=self.device,
dtype=self.dtype,
)
q_weight = torch.empty([self.q_size], device=self.device, dtype=self.dtype)
k_weight = torch.empty([self.kv_size], device=self.device, dtype=self.dtype)
return [qkv, q_weight, k_weight]
def register(self, pm_pass: PatternMatcherPass) -> None:
q_size = self.q_size
kv_size = self.kv_size
eps = self.eps
tp_world = self.tp_world
max_tokens = self.max_tokens
tp_rank = self.tp_rank
dtype = self.dtype
def pattern(
qkv: torch.Tensor,
q_weight: torch.Tensor,
k_weight: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
q_fp32 = q.to(torch.float32)
k_fp32 = k.to(torch.float32)
q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
qk_var = torch.cat([q_var, k_var], dim=-1)
qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
q_var, k_var = qk_var.chunk(2, dim=-1)
q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
return q_out, k_out, v
def replacement(
qkv: torch.Tensor,
q_weight: torch.Tensor,
k_weight: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
assert _MINIMAX_QK_NORM_FUSED_OP is not None
q_out, k_out = torch.ops.vllm.minimax_qk_norm_fused(
qkv,
q_weight,
k_weight,
q_size,
kv_size,
tp_rank,
tp_world,
eps,
max_tokens,
)
_, _, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
return q_out, k_out, v
pm.register_replacement(
pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
)
# Second pattern: three separate split_with_sizes nodes (one per output),
# each with _users=1. This occurs when the QKV projection uses a
# functional GEMM kernel (e.g. cutlass_scaled_mm via auto_functionalized),
# which causes inductor to generate one split per consumer.
def pattern_split3(
qkv: torch.Tensor,
q_weight: torch.Tensor,
k_weight: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
q = qkv.split([q_size, kv_size, kv_size], dim=-1)[0]
k = qkv.split([q_size, kv_size, kv_size], dim=-1)[1]
v = qkv.split([q_size, kv_size, kv_size], dim=-1)[2]
q_fp32 = q.to(torch.float32)
k_fp32 = k.to(torch.float32)
q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
qk_var = torch.cat([q_var, k_var], dim=-1)
qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
q_var, k_var = qk_var.chunk(2, dim=-1)
q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
return q_out, k_out, v
pm.register_replacement(
pattern_split3, replacement, self.get_inputs(), pm.fwd_only, pm_pass
)
class MiniMaxQKNormPass(VllmPatternMatcherPass):
    """
    Replace forward_qk allreduce+norm with the Lamport fused kernel.
    Only applied for decode-size compile ranges (small token counts).

    The pass starts out disabled and is only enabled once every precondition
    checked in ``__init__`` holds and the patterns have been registered.
    """

    def __init__(self, config: VllmConfig) -> None:
        """Validate the environment and model, then register the patterns.

        The pass stays disabled when the fused op is missing, when tensor
        parallelism is not in use, when there is no model config, when the
        first architecture is not ``MiniMaxM2ForCausalLM``, or when the head
        and hidden sizes differ from the hard-coded supported values.
        """
        super().__init__(config)
        self.disabled = True

        if _MINIMAX_QK_NORM_FUSED_OP is None:
            logger.warning_once(
                "minimax_allreduce_rms_qk op not found, MiniMaxQKNormPass disabled."
            )
            return

        tp_world = get_tensor_model_parallel_world_size()
        if tp_world <= 1:
            logger.warning_once("MiniMaxQKNormPass disabled: tp_size <= 1.")
            return

        if config.model_config is None:
            logger.warning_once("MiniMaxQKNormPass disabled: no model_config.")
            return

        hf_cfg = config.model_config.hf_config
        # `architectures` may be missing, None or empty. Treat all of these
        # as "not a MiniMax M2 model" rather than raising on the [0] lookup.
        architectures = getattr(hf_cfg, "architectures", None) or ()
        model_name = architectures[0] if architectures else ""
        if model_name != "MiniMaxM2ForCausalLM":
            return

        num_attention_heads = getattr(hf_cfg, "num_attention_heads", 0)
        num_key_value_heads = getattr(hf_cfg, "num_key_value_heads", 0)
        hidden_size = getattr(hf_cfg, "hidden_size", 0)
        head_dim = getattr(hf_cfg, "head_dim", 0)
        eps: float = getattr(hf_cfg, "rms_norm_eps", 1e-6)

        # Only this exact model geometry is accepted.
        if (
            num_attention_heads != 48
            or num_key_value_heads != 8
            or hidden_size != 3072
            or head_dim != 128
        ):
            logger.warning_once(
                "MiniMaxQKNormPass disabled: cannot infer model info from hf_config."
            )
            return

        # Per-rank split sizes of the packed qkv tensor; each rank keeps at
        # least one KV head.
        num_heads_per_rank = num_attention_heads // tp_world
        num_kv_heads_per_rank = max(1, num_key_value_heads // tp_world)
        q_size = num_heads_per_rank * head_dim
        kv_size = num_kv_heads_per_rank * head_dim

        self.max_token_num = min(
            MAX_TOKEN_NUM, config.scheduler_config.max_num_batched_tokens
        )

        tp_rank = get_tensor_model_parallel_rank()

        # Allocate Lamport workspace first.
        from vllm.distributed.parallel_state import get_tp_group
        from vllm.model_executor.layers.mamba.lamport_workspace import (
            get_allreduce_workspace,
        )

        get_allreduce_workspace(
            rank=tp_rank,
            world_size=tp_world,
            max_tokens=self.max_token_num,
            process_group=get_tp_group().cpu_group,
        )

        self.patterns: PatternMatcherPass = PatternMatcherPass(
            pass_name="minimax_qk_norm_pass"
        )
        self._register_patterns(q_size, kv_size, eps, tp_world, tp_rank)
        self.dump_patterns(config, self.patterns)
        self.disabled = False

    @enable_fake_mode
    def _register_patterns(
        self,
        q_size: int,
        kv_size: int,
        eps: float,
        tp_world: int,
        tp_rank: int,
    ) -> None:
        """Build a ``MiniMaxQKNormPattern`` and register it on ``self.patterns``."""
        MiniMaxQKNormPattern(
            q_size=q_size,
            kv_size=kv_size,
            eps=eps,
            tp_world=tp_world,
            tp_rank=tp_rank,
            max_tokens=self.max_token_num,
            dtype=self.model_dtype,
            device=self.device,
        ).register(self.patterns)

    def is_applicable_for_range(self, compile_range: Range) -> bool:
        """Return True if the pass is enabled and the range end is within
        ``max_token_num``."""
        # `max_token_num` is only set once all checks in __init__ have
        # passed, so the disabled check has to come first.
        if self.disabled:
            return False
        return bool(compile_range.end <= self.max_token_num)

    @VllmInductorPass.time_and_log
    def __call__(self, graph: fx.Graph) -> None:
        """Apply the registered patterns to ``graph`` and record the match count."""
        if self.disabled:
            return
        self.matched_count = self.patterns.apply(graph)
        logger.debug("MiniMaxQKNormPass replaced %s patterns", self.matched_count)

    def uuid(self) -> str:
        """Return a hash of this pass's source and the pattern class's source."""
        return VllmInductorPass.hash_source(self, MiniMaxQKNormPattern)
...@@ -36,6 +36,7 @@ if current_platform.is_cuda_alike(): ...@@ -36,6 +36,7 @@ if current_platform.is_cuda_alike():
if current_platform.is_cuda(): if current_platform.is_cuda():
from .fusion.allreduce_rms_fusion import AllReduceFusionPass from .fusion.allreduce_rms_fusion import AllReduceFusionPass
from .fusion.collective_fusion import AsyncTPPass from .fusion.collective_fusion import AsyncTPPass
from .fusion.minimax_qk_norm_fusion import MiniMaxQKNormPass
from .inductor_pass import ( from .inductor_pass import (
CustomGraphPass, CustomGraphPass,
...@@ -124,6 +125,9 @@ class PostGradPassManager(CustomGraphPass): # type: ignore[misc] ...@@ -124,6 +125,9 @@ class PostGradPassManager(CustomGraphPass): # type: ignore[misc]
if self.pass_config.fuse_allreduce_rms: if self.pass_config.fuse_allreduce_rms:
self.passes += [AllReduceFusionPass(config)] self.passes += [AllReduceFusionPass(config)]
if self.pass_config.fuse_minimax_qk_norm:
self.passes += [MiniMaxQKNormPass(config)]
if self.pass_config.fuse_norm_quant: if self.pass_config.fuse_norm_quant:
self.passes += [RMSNormQuantFusionPass(config)] self.passes += [RMSNormQuantFusionPass(config)]
if rocm_aiter_ops.is_enabled(): if rocm_aiter_ops.is_enabled():
......
...@@ -132,6 +132,8 @@ class PassConfig: ...@@ -132,6 +132,8 @@ class PassConfig:
"""Enable async TP.""" """Enable async TP."""
fuse_allreduce_rms: bool = None # type: ignore[assignment] fuse_allreduce_rms: bool = None # type: ignore[assignment]
"""Enable flashinfer allreduce fusion.""" """Enable flashinfer allreduce fusion."""
fuse_minimax_qk_norm: bool = None # type: ignore[assignment]
"""Enable fused allreduce+RMSNorm for MiniMax QK norm."""
enable_qk_norm_rope_fusion: bool = False enable_qk_norm_rope_fusion: bool = False
"""Enable fused Q/K RMSNorm + RoPE pass.""" """Enable fused Q/K RMSNorm + RoPE pass."""
...@@ -282,7 +284,7 @@ class PassConfig: ...@@ -282,7 +284,7 @@ class PassConfig:
""" """
enabled_fusions = [ enabled_fusions = [
f.name[len("fuse_") :] f.name[len("fuse_") :]
for f in fields(self) for f in fields(self) # type: ignore[arg-type]
if getattr(self, f.name) and f.name.startswith("fuse_") if getattr(self, f.name) and f.name.startswith("fuse_")
] ]
...@@ -486,9 +488,10 @@ class CompilationConfig: ...@@ -486,9 +488,10 @@ class CompilationConfig:
If empty list [], no ops are excluded (suitable for full cudagraphs).""" If empty list [], no ops are excluded (suitable for full cudagraphs)."""
compile_mm_encoder: bool = False compile_mm_encoder: bool = False
"""Whether or not to compile the multimodal encoder. """Whether or not to compile the multimodal encoder.
Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models on selected
on selected platforms. Disabled by default until more models platforms. It may also work for models loaded with the Transformers modeling backend
are supported/tested to work.""" if the encoder is compilable. Disabled by default until more models are
supported/tested to work."""
# Vision encoder CUDA graph # Vision encoder CUDA graph
cudagraph_mm_encoder: bool = False cudagraph_mm_encoder: bool = False
......
...@@ -805,6 +805,8 @@ class SpeculativeConfig: ...@@ -805,6 +805,8 @@ class SpeculativeConfig:
"deepseek_v3", "deepseek_v3",
"kimi_k2", "kimi_k2",
"kimi_k25", "kimi_k25",
"minimax_m2",
"gemma4",
] ]
if ( if (
self.method in ("eagle3", "extract_hidden_states") self.method in ("eagle3", "extract_hidden_states")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment