Merge tag 'v0.19.1' into v0.19.0

fc67613a · zhuwenwen · 31aec25b · b1388b1f · fc67613a · fc67613a
Commit fc67613a authored Apr 18, 2026 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/generation/test_phi4siglip.py
+++ b/tests/models/multimodal/generation/test_phi4siglip.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
+from importlib.metadata import version
+import pytest
+import regex as re
+from packaging.version import Version
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from vllm.logprobs import SampleLogprobs
+from vllm.multimodal.image import rescale_image_size
+from ....conftest import (
+    IMAGE_ASSETS,
+    HfRunner,
+    PromptImageInput,
+    VllmRunner,
+)
+from ....utils import multi_gpu_test
+from ...utils import check_logprobs_close
+# Module-level mark: every test in this file is skipped when the installed
+# transformers version is 5.0 or newer.
+# NOTE(review): the guard triggers at 5.0, while the reason text mentions
+# v5.4 and the Phi4ForCausalLMV registry entry touched by this change uses
+# max_transformers_version="5.3" -- confirm the intended threshold.
+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "vllm upgraded transformers above v5.4 where HF model custom code uses siglip2 "
+        "internals (filter_out_non_signature_kwargs) removed by "
+        "huggingface/transformers#43514"
+    ),
+)
+# Checkpoint used to parametrize the tests in this module.
+MODEL_ID = "microsoft/Phi-4-reasoning-vision-15B"
+# One single-image prompt per entry (keys presumably name the image assets).
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|user|>\n<image>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+        "cherry_blossom": "<|user|>\n<image>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+    }
+)
+# Prompt with two <image> placeholders, used by the multi-image inputs.
+HF_MULTIIMAGE_IMAGE_PROMPT = (
+    "<|user|>\n<image>\n<image>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+)
+# dtype passed to both the vLLM and the HF runner.
+DTYPE = "half"
+# Generation length and number of logprobs requested from both runners.
+MAX_TOKENS = 128
+NUM_LOGPROBS = 10
+def vllm_to_hf_output(
+    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
+):
+    """Rewrite a vLLM result triple so it lines up with HF-style output.
+
+    The vLLM token ids are discarded. Every run of ``<image>`` placeholders is
+    removed from the text and at most one leading space is dropped; the
+    cleaned text is re-encoded with the tokenizer of ``model`` (a leading BOS
+    id is stripped) and returned together with the cleaned text suffixed by
+    ``<|end|><|endoftext|>`` and the untouched logprobs.
+    """
+    _ids, raw_text, logprobs = vllm_output
+    cleaned_text = re.sub(r"(<image>)+", "", raw_text).removeprefix(" ")
+
+    hf_tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    token_ids = hf_tokenizer.encode(cleaned_text)
+    # Drop the BOS id when the tokenizer prepended one.
+    if token_ids and token_ids[0] == hf_tokenizer.bos_token_id:
+        token_ids = token_ids[1:]
+
+    return token_ids, cleaned_text + "<|end|><|endoftext|>", logprobs
+def _build_single_image_inputs(
+    image_assets,
+) -> list[tuple[list[str], PromptImageInput]]:
+    """Assemble every single-image test case in one list.
+
+    For each group of scale factors and each (image, prompt) pair, one entry
+    is produced: the prompt repeated once per factor, paired with the image
+    rescaled by each factor of the group.
+    """
+    pil_images = [asset.pil_image for asset in image_assets]
+    scale_groups = ([1.0], [0.25, 0.5, 1.0])
+    return [
+        (
+            [prompt] * len(scales),
+            [rescale_image_size(pil_image, scale) for scale in scales],
+        )
+        for scales in scale_groups
+        for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS)
+    ]
+def _build_multi_image_inputs(
+    image_assets,
+) -> list[tuple[list[str], PromptImageInput]]:
+    """Assemble every multi-image test case in one list.
+
+    Each group of scale factors yields one entry: the multi-image prompt
+    repeated once per factor, paired with one list per factor holding all
+    asset images rescaled by that factor.
+    """
+    pil_images = [asset.pil_image for asset in image_assets]
+    scale_groups = ([0.5], [0.15, 0.30])
+    return [
+        (
+            [HF_MULTIIMAGE_IMAGE_PROMPT] * len(scales),
+            [
+                [rescale_image_size(pil_image, scale) for pil_image in pil_images]
+                for scale in scales
+            ],
+        )
+        for scales in scale_groups
+    ]
+def _run_and_compare(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    all_inputs: Sequence[tuple[list[str], PromptImageInput]],
+    model: str,
+    max_model_len: int,
+    max_num_seqs: int,
+    mm_limit: int,
+    gpu_memory_utilization: float,
+):
+    """Load each runner once, run all inputs, then compare.
+
+    Args:
+        hf_runner: Runner class used to produce the HF reference outputs.
+        vllm_runner: Runner class used to produce the vLLM outputs.
+        all_inputs: ``(prompts, images)`` cases; each case is sent to both
+            runners as one greedy-logprobs call.
+        model: Model identifier passed to both runners.
+        max_model_len: Forwarded to the vLLM runner only.
+        max_num_seqs: Forwarded to the vLLM runner only.
+        mm_limit: Per-prompt image limit, passed to the vLLM runner as
+            ``limit_mm_per_prompt={"image": mm_limit}``.
+        gpu_memory_utilization: Forwarded to the vLLM runner only.
+    """
+    # NOTE: run vLLM first, then HF.  vLLM needs a fresh process without
+    # cuda initialization; running HF first would break the multiprocessing
+    # backend with fork method.
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        gpu_memory_utilization=gpu_memory_utilization,
+        dtype=DTYPE,
+        limit_mm_per_prompt={"image": mm_limit},
+        # Tensor parallelism is fixed at 2 here, independent of the arguments.
+        tensor_parallel_size=2,
+        trust_remote_code=True,
+        enforce_eager=True,
+    ) as vllm_model:
+        # One result list per input case, in the order of ``all_inputs``.
+        vllm_outputs_per_case = [
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                MAX_TOKENS,
+                num_logprobs=NUM_LOGPROBS,
+                images=images,
+            )
+            for prompts, images in all_inputs
+        ]
+    # The HF reference is created only after the vLLM context has exited.
+    hf_model_kwargs = {"_attn_implementation": "sdpa", "device_map": "auto"}
+    with hf_runner(
+        model,
+        dtype=DTYPE,
+        model_kwargs=hf_model_kwargs,
+        auto_cls=AutoModelForCausalLM,
+        trust_remote_code=True,
+    ) as hf_model:
+        hf_outputs_per_case = [
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                MAX_TOKENS,
+                num_logprobs=NUM_LOGPROBS,
+                images=images,
+            )
+            for prompts, images in all_inputs
+        ]
+    # Pair up the per-case results of both runners and check each pair.
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("model", [MODEL_ID])
+def test_models(hf_runner, vllm_runner, image_assets, model) -> None:
+    """Compare HF and vLLM logprobs on the single-image inputs."""
+    _run_and_compare(
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        all_inputs=_build_single_image_inputs(image_assets),
+        model=model,
+        max_model_len=8192,
+        max_num_seqs=2,
+        # Single-image prompts: allow one image per prompt.
+        mm_limit=1,
+        gpu_memory_utilization=0.80,
+    )
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("model", [MODEL_ID])
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model) -> None:
+    """Compare HF and vLLM logprobs on the multi-image inputs."""
+    _run_and_compare(
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        all_inputs=_build_multi_image_inputs(image_assets),
+        model=model,
+        max_model_len=8192,
+        max_num_seqs=2,
+        # The multi-image prompt carries two <image> placeholders.
+        mm_limit=2,
+        gpu_memory_utilization=0.80,
+    )
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -149,6 +149,10 @@ def test_online_serving(vllm_runner, audio_assets: AudioTestAssets):
    )
+@pytest.mark.skip(
+    reason="VoxtralProcessor.apply_chat_template() in transformers v5 "
+    "doesn't resolve chat_template=None to the default template"
+)
 def test_hf_reference(hf_runner, vllm_runner, audio_assets: AudioTestAssets):
    """Compare vLLM Mistral-format output against HF Transformers reference.

--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -80,6 +80,11 @@ def run_test(
    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)
+    # Avoid passing limit_mm_per_prompt twice when vllm_runner_kwargs
+    # already contains it (e.g. gemma4 sets it via vllm_runner_kwargs).
+    if "limit_mm_per_prompt" in vllm_runner_kwargs_:
+        limit_mm_per_prompt = vllm_runner_kwargs_.pop("limit_mm_per_prompt")
    with vllm_runner(
        model,
        max_model_len=max_model_len,

--- a/tests/models/multimodal/pooling/test_colqwen3.py
+++ b/tests/models/multimodal/pooling/test_colqwen3.py
@@ -22,6 +22,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
 from ....conftest import VllmRunner
+pytestmark = pytest.mark.skip(
+    reason="ColQwen3 model's weight tying is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
 MODELS = [
    "TomoroAI/tomoro-colqwen3-embed-4b",
    "OpenSearch-AI/Ops-Colqwen3-4B",

--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -11,6 +11,11 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from ....conftest import ImageTestAssets
+pytestmark = pytest.mark.skip(
+    reason="InternVisionModel's custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
 DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]

--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -15,6 +15,11 @@ from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
 from ....conftest import HfRunner, VllmRunner
+pytestmark = pytest.mark.skip(
+    reason="jinaai/jina-reranker-m0 custom code is incompatible with "
+    "transformers v5 (missing all_tied_weights_keys)"
+)
 MODELS = ["jinaai/jina-reranker-m0"]
 MM_PROCESSOR_KWARGS = {

--- a/tests/models/multimodal/processing/test_musicflamingo.py
+++ b/tests/models/multimodal/processing/test_musicflamingo.py
@@ -17,11 +17,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from importlib.metadata import version
 from unittest.mock import MagicMock
 import numpy as np
 import pytest
 import torch
+from packaging.version import Version
 from transformers import PretrainedConfig
 from tests.models.registry import HF_EXAMPLE_MODELS
@@ -122,6 +124,11 @@ def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
    assert builder.get_dummy_text({"audio": 2}) == "<sound><sound>"
+@pytest.mark.skipif(
+    Version(version("transformers")) >= Version("5.5"),
+    reason="transformers v5.5 added native MusicFlamingoForConditionalGeneration "
+    "with a different get_audio_features signature (requires input_ids)",
+)
 def test_musicflamingo_audio_feature_pipeline_matches_hf_small_config():
    from transformers.models.musicflamingo import (
        modeling_musicflamingo as hf_musicflamingo_modeling,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -334,7 +334,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        "internlm/internlm2-chat-7b", trust_remote_code=True
    ),
    "InternLM2VEForCausalLM": _HfExamplesInfo(
-        "OpenGVLab/Mono-InternVL-2B", trust_remote_code=True
+        "OpenGVLab/Mono-InternVL-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `vision_config` is not always set"
+            )
+        },
    ),
    "InternLM3ForCausalLM": _HfExamplesInfo(
        "internlm/internlm3-8b-instruct", trust_remote_code=True
@@ -469,6 +477,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Plamo2ForCausalLM": _HfExamplesInfo(
        "pfnet/plamo-2-1b",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": (
+                "Custom model code uses `_tied_weight_keys: list[str]` but "
+                "Transformers v5 now expects `_tied_weight_keys: dict[str, str]`"
+            )
+        },
    ),
    "Plamo3ForCausalLM": _HfExamplesInfo(
        "pfnet/plamo-3-nict-2b-base",
@@ -509,6 +524,13 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        trust_remote_code=True,
        max_model_len=4096,
        is_available_online=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where "
+                "validate_rope() no longer accepts ignore_keys param"
+            )
+        },
    ),
    "SeedOssForCausalLM": _HfExamplesInfo(
        "ByteDance-Seed/Seed-OSS-36B-Instruct",
@@ -544,6 +566,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
        "xverse/XVERSE-7B-Chat",
        tokenizer="meta-llama/Llama-2-7b",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "XVERSE tokenizer is incompatible with transformers v5 "
+            "(add_prefix_space / prepend_scheme mismatch).",
+        },
    ),
    "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"),
    "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True),
@@ -754,10 +781,18 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0"
+        "nvidia/audio-flamingo-3-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
    ),
    "MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
-        "nvidia/music-flamingo-2601-hf", min_transformers_version="5.3.0"
+        "nvidia/music-flamingo-2601-hf",
+        min_transformers_version="5.3.0",
+        transformers_version_reason={
+            "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
+        },
    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
@@ -800,9 +835,30 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
        "allendou/FireRedASR2-LLM-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
+    ),
+    "FireRedLIDForConditionalGeneration": _HfExamplesInfo(
+        "PatchyTisa/FireRedLID-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
    ),
    "FunASRForConditionalGeneration": _HfExamplesInfo(
        "allendou/Fun-ASR-Nano-2512-vllm",
+        trust_remote_code=True,
+        max_transformers_version="5.1",
+        transformers_version_reason={
+            "vllm": "Incompatible with transformers v5.2+ "
+            "(dict object has no attribute '__name__').",
+        },
    ),
    "FunAudioChatForConditionalGeneration": _HfExamplesInfo(
        "funaudiochat", is_available_online=False
@@ -844,6 +900,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "HCXVisionForCausalLM": _HfExamplesInfo(
        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "Custom config cannot be loaded with Transformers "
+                "v5 because `text_config` is not always set"
+            )
+        },
    ),
    "HCXVisionV2ForCausalLM": _HfExamplesInfo(
        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
@@ -863,7 +926,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        extras={"0.2-2B-Preview": "PerceptronAI/Isaac-0.2-2B-Preview"},
    ),
    "InternS1ForConditionalGeneration": _HfExamplesInfo(
-        "internlm/Intern-S1", trust_remote_code=True
+        "internlm/Intern-S1",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom tokenizer code is not compatible with Transformers v5."
+        },
    ),
    "InternS1ProForConditionalGeneration": _HfExamplesInfo(
        "internlm/Intern-S1-Pro",
@@ -952,7 +1020,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "MiDashengLMModel": _HfExamplesInfo(
        "mispeech/midashenglm-7b", trust_remote_code=True
    ),
-    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True),
+    "MiniCPMO": _HfExamplesInfo(
+        "openbmb/MiniCPM-o-2_6",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
    "MiniCPMV": _HfExamplesInfo(
        "openbmb/MiniCPM-Llama3-V-2_5",
        extras={
@@ -960,6 +1035,13 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "4.0": "openbmb/MiniCPM-V-4",
            "4.5": "openbmb/MiniCPM-V-4_5",
        },
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "MiniCPMVBatchFeature is incompatible with its base class in "
+                "Transformers v5. See https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5/discussions/78"
+            )
+        },
        trust_remote_code=True,
    ),
    "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo(
@@ -996,13 +1078,25 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "nano_vl_dummy", is_available_online=False, trust_remote_code=True
    ),
    "OpenCUAForConditionalGeneration": _HfExamplesInfo(
-        "xlangai/OpenCUA-7B", trust_remote_code=True
+        "xlangai/OpenCUA-7B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Tokenizer cannot be initialised in Transformers v5."
+        },
    ),
    "OpenPanguVLForConditionalGeneration": _HfExamplesInfo(
        "FreedomIntelligence/openPangu-VL-7B",
        trust_remote_code=True,
        max_model_len=4096,
        enforce_eager=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": (
+                "OpenPanguVLVideoProcessorInitKwargs does not specify total=False, "
+                "making all kwargs required. See https://huggingface.co/FreedomIntelligence/openPangu-VL-7B/discussions/2"
+            )
+        },
    ),
    "Ovis": _HfExamplesInfo(
        "AIDC-AI/Ovis2-1B",
@@ -1014,12 +1108,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B",
        },
    ),
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "Ovis2_5": _HfExamplesInfo(
+        "AIDC-AI/Ovis2.5-2B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
+    ),
    "Ovis2_6ForCausalLM": _HfExamplesInfo(
        "AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
    ),
    "Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
-        "AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
+        "AIDC-AI/Ovis2.6-30B-A3B",
+        trust_remote_code=True,
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "vllm": "Custom processor code is not compatible with Transformers v5."
+        },
    ),
    "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
        "PaddlePaddle/PaddleOCR-VL",
@@ -1038,6 +1144,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        },  # noqa: E501
        extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"},
    ),
+    "Phi4ForCausalLMV": _HfExamplesInfo(
+        "microsoft/Phi-4-reasoning-vision-15B",
+        trust_remote_code=True,
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "vllm upgraded transformers above v5.4 where HF model "
+                "custom code uses siglip2 internals "
+                "(filter_out_non_signature_kwargs) removed "
+                "by huggingface/transformers#43514"
+            )
+        },
+    ),
    "Phi4MMForCausalLM": _HfExamplesInfo(
        "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True
    ),
@@ -1133,6 +1252,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
+        max_transformers_version="5.3",
+        transformers_version_reason={
+            "vllm": (
+                "Qwen2VLConfig was split into Qwen2VLConfig + "
+                "Qwen2VLTextConfig in transformers v5, breaking "
+                "attribute access (num_attention_heads, hidden_size, etc.)"
+            )
+        },
    ),
    "VoxtralForConditionalGeneration": _HfExamplesInfo(
        "mistralai/Voxtral-Mini-3B-2507",

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -375,6 +375,7 @@ def softmax(data):
 @dataclass
 class ModelInfo:
    name: str
+    revision: str | None = None
    architecture: str = ""
    dtype: str = "auto"
    max_model_len: int | None = None
@@ -468,7 +469,16 @@ def dummy_hf_overrides(
    else:
        # Use minimal layers for testing
        num_layers = 1
-        num_hidden_layers = 3 if model_arch == "Gemma3nForConditionalGeneration" else 1
+        num_hidden_layers = (
+            3
+            if model_arch
+            in (
+                "Gemma3nForConditionalGeneration",
+                "Gemma4ForCausalLM",
+                "Gemma4ForConditionalGeneration",
+            )
+            else 1
+        )
    update_dict = {
        "num_layers": num_layers,

--- a/tests/reasoning/test_gemma4_reasoning_parser.py
+++ b/tests/reasoning/test_gemma4_reasoning_parser.py
@@ -4,6 +4,9 @@
 import pytest
 from tests.reasoning.utils import run_reasoning_extraction
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+)
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 # Using mistral tokenizer as a generic mock since the actual model is not on HF
@@ -100,6 +103,39 @@ NEW_LINE_STREAMING = {
    "is_reasoning_end": True,
 }
+# Start token followed by a "thought\n" label: the expected reasoning does
+# not include the label.
+THOUGHT_PREFIX = {
+    "output": "<|channel>thought\nActual reasoning here<channel|>Final answer",
+    "reasoning": "Actual reasoning here",
+    "content": "Final answer",
+    "is_reasoning_end": True,
+}
+# Label with nothing after it: expected reasoning is empty and there is no
+# content.
+THOUGHT_PREFIX_ONLY = {
+    "output": "<|channel>thought\n<channel|>",
+    "reasoning": "",
+    "content": None,
+    "is_reasoning_end": True,
+}
+# Newlines inside the reasoning text after the label are expected to be kept.
+THOUGHT_PREFIX_MULTILINE = {
+    "output": "<|channel>thought\nLine1\nLine2<channel|>Answer",
+    "reasoning": "Line1\nLine2",
+    "content": "Answer",
+    "is_reasoning_end": True,
+}
+# "thousand" starts like "thought" but diverges — exercises Case 2→3 in streaming.
+THOUGHT_PREFIX_DIVERGE = {
+    "output": "<|channel>thousand reasons<channel|>Done",
+    "reasoning": "thousand reasons",
+    "content": "Done",
+    "is_reasoning_end": True,
+}
+# The model isn't reasoning if we're generating tool calls.
+# Expected: no reasoning, and the tool-call token is passed through as content.
+TOOL_CALL_STARTED = {
+    "output": "<|tool_call>",
+    "reasoning": None,
+    "content": "<|tool_call>",
+    "is_reasoning_end": True,
+}
 TEST_CASES = [
    pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
    pytest.param(True, INVALID_SIMPLE_STREAMING, id="invalid_simple_streaming"),
@@ -120,17 +156,22 @@ TEST_CASES = [
    pytest.param(False, EMPTY, id="empty"),
    pytest.param(False, NEW_LINE_NONSTREAMING, id="new_line"),
    pytest.param(True, NEW_LINE_STREAMING, id="new_line_streaming"),
+    pytest.param(False, THOUGHT_PREFIX, id="thought_prefix"),
+    pytest.param(True, THOUGHT_PREFIX, id="thought_prefix_streaming"),
+    pytest.param(False, THOUGHT_PREFIX_ONLY, id="thought_prefix_only"),
+    pytest.param(True, THOUGHT_PREFIX_ONLY, id="thought_prefix_only_streaming"),
+    pytest.param(False, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline"),
+    pytest.param(
+        True, THOUGHT_PREFIX_MULTILINE, id="thought_prefix_multiline_streaming"
+    ),
+    pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
+    pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
+    pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
+    pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
 ]
-@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
-def test_gemma4_reasoning(
-    streaming: bool,
-    param_dict: dict,
-    generic_tokenizer,
-):
-    output = param_dict["output"]
    # Resolve token IDs dynamically from the real tokenizer
    vocab = generic_tokenizer.get_vocab()
    start_token_id = vocab["<|channel>"]
@@ -176,6 +217,18 @@ def test_gemma4_reasoning(
    else:
        output_tokens += _encode(output)
+    return output_tokens
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_gemma4_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    generic_tokenizer,
+):
+    output = param_dict["output"]
+    output_tokens = gemma4_encode_output(generic_tokenizer, output)
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
        generic_tokenizer
    )
@@ -194,3 +247,29 @@ def test_gemma4_reasoning(
    # Test is_reasoning_end
    is_reasoning_end = parser.is_reasoning_end(output_tokens)
    assert is_reasoning_end == param_dict["is_reasoning_end"]
+def test_gemma4_adjust_request(generic_tokenizer):
+    """adjust_request must switch off skip_special_tokens on the given request."""
+    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
+    reasoning_parser: ReasoningParser = parser_cls(generic_tokenizer)
+
+    chat_request = ChatCompletionRequest(messages=[], model="test-model")
+    # Precondition: a freshly built request has the flag enabled.
+    assert chat_request.skip_special_tokens is True
+
+    adjusted = reasoning_parser.adjust_request(chat_request)
+    assert adjusted.skip_special_tokens is False
+    # The very same request object is expected back, not a copy.
+    assert adjusted is chat_request
+def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
+    """A finished reasoning block from an earlier turn must not be reported
+    as the end of reasoning once a new model turn has been opened."""
+    conversation = (
+        "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
+        "<|turn>user\nThanks<|turn>model\n"
+    )
+    token_ids = gemma4_encode_output(generic_tokenizer, conversation)
+
+    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
+    reasoning_parser: ReasoningParser = parser_cls(generic_tokenizer)
+
+    assert not reasoning_parser.is_reasoning_end(token_ids)
--- a/tests/reasoning/test_step3p5_reasoning_parser.py
+++ b/tests/reasoning/test_step3p5_reasoning_parser.py
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
-from transformers import AutoTokenizer
 from tests.reasoning.utils import run_reasoning_extraction
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
+from vllm.tokenizers import get_tokenizer
 parser_name = "step3p5"
 start_token = "<think>"
@@ -16,7 +16,7 @@ REASONING_MODEL_NAME = "stepfun-ai/Step-3.5-Flash"
 @pytest.fixture(scope="module")
 def step3p5_tokenizer():
-    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+    return get_tokenizer(tokenizer_name=REASONING_MODEL_NAME)
 SIMPLE_REASONING = {

--- a/tests/renderers/test_gemma4_chat_template.py
+++ b/tests/renderers/test_gemma4_chat_template.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Gemma4 chat template rendering."""
+from pathlib import Path
+import jinja2.sandbox
+import pytest
+TEMPLATE_PATH = (
+    Path(__file__).resolve().parent.parent.parent
+    / "examples"
+    / "tool_chat_template_gemma4.jinja"
+)
+@pytest.fixture(scope="module")
+def gemma4_template():
+    """Load and compile the Gemma4 chat template."""
+    template_str = TEMPLATE_PATH.read_text()
+    env = jinja2.sandbox.ImmutableSandboxedEnvironment()
+    return env.from_string(template_str)
+def _render(template, messages, **kwargs):
+    """Render the template with sensible defaults."""
+    kwargs.setdefault("bos_token", "<bos>")
+    kwargs.setdefault("add_generation_prompt", False)
+    return template.render(messages=messages, **kwargs)
+class TestGemma4ChatTemplate:
+    def test_basic_multiturn_thinking_disabled(self, gemma4_template):
+        """With enable_thinking=False (default), generation prompt ends with
+        an empty thought channel to suppress thinking."""
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+            {"role": "user", "content": "How are you?"},
+        ]
+        result = _render(gemma4_template, messages, add_generation_prompt=True)
+        assert "<|turn>user\n" in result
+        assert "<|turn>model\n" in result
+        assert "Hello" in result
+        assert "Hi there!" in result
+        assert "How are you?" in result
+        assert result.rstrip("\n").endswith("<|channel>thought\n<channel|>")
+    def test_basic_multiturn_thinking_enabled(self, gemma4_template):
+        """With enable_thinking=True, generation prompt ends with model
+        turn opener (no thought suppression)."""
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+            {"role": "user", "content": "How are you?"},
+        ]
+        result = _render(
+            gemma4_template,
+            messages,
+            add_generation_prompt=True,
+            enable_thinking=True,
+        )
+        assert "<|turn>user\n" in result
+        assert "<|turn>model\n" in result
+        assert "Hello" in result
+        assert "Hi there!" in result
+        assert "How are you?" in result
+        assert result.rstrip("\n").endswith("<|turn>model")
+    def test_system_message(self, gemma4_template):
+        """A system message renders a <|turn>system turn and keeps its text."""
+        messages = [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "Hi"},
+        ]
+        result = _render(gemma4_template, messages)
+        assert "<|turn>system\n" in result
+        assert "You are helpful." in result
+    def test_thinking_enabled(self, gemma4_template):
+        """With enable_thinking=True the output carries the <|think|> marker
+        and a system turn, even though no system message is supplied."""
+        messages = [{"role": "user", "content": "Think about this"}]
+        result = _render(
+            gemma4_template,
+            messages,
+            add_generation_prompt=True,
+            enable_thinking=True,
+        )
+        assert "<|think|>" in result
+        assert "<|turn>system\n" in result
+    def test_tool_declarations(self, gemma4_template):
+        """Tools passed to the template are expected to be declared between
+        <|tool> and <tool|>, with string values wrapped in <|"|> delimiters."""
+        # A single OpenAI-style function tool with one required string parameter.
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "description": "Get weather for a city",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "City name",
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                },
+            }
+        ]
+        messages = [{"role": "user", "content": "What is the weather?"}]
+        result = _render(
+            gemma4_template,
+            messages,
+            tools=tools,
+            add_generation_prompt=True,
+        )
+        assert "<|tool>" in result
+        assert "declaration:get_weather" in result
+        assert "<tool|>" in result
+        # The parameter description must appear as a delimited string.
+        assert '<|"|>City name<|"|>' in result
+    def test_tool_calls_in_assistant(self, gemma4_template):
+        """tool_calls on an assistant message are expected to render as
+        <|tool_call>call:NAME{...}<tool_call|> with delimited string args."""
+        messages = [
+            {"role": "user", "content": "Weather in London?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "get_weather",
+                            # Arguments are given as a dict, not a JSON string.
+                            "arguments": {"city": "London"},
+                        },
+                    }
+                ],
+            },
+        ]
+        result = _render(gemma4_template, messages)
+        assert "<|tool_call>call:get_weather{" in result
+        assert "}<tool_call|>" in result
+        assert '<|"|>London<|"|>' in result
+    def test_tool_responses_openai_style(self, gemma4_template):
+        """role='tool' messages are formatted as <|tool_response> blocks
+        with content dumped as-is."""
+        messages = [
+            {"role": "user", "content": "Weather?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "London"},
+                        },
+                    }
+                ],
+            },
+            # The tool message only references the call by id; its content is
+            # a raw JSON string.
+            {
+                "role": "tool",
+                "tool_call_id": "call_1",
+                "content": '{"temperature": 15, "condition": "sunny"}',
+            },
+        ]
+        result = _render(gemma4_template, messages, add_generation_prompt=True)
+        assert "<|tool_response>" in result
+        # The response block is expected to carry the function name taken from
+        # the assistant's tool call with the same id.
+        assert "response:get_weather{" in result
+        assert "<tool_response|>" in result
+        # The JSON content must appear unchanged (quotes and spacing kept).
+        assert '"temperature": 15' in result
+    def test_tool_responses_legacy_style(self, gemma4_template):
+        """tool_responses embedded on the assistant message."""
+        messages = [
+            {"role": "user", "content": "Weather?"},
+            {
+                "role": "assistant",
+                "content": "",
+                # Legacy shape: the call has no id and the response sits on
+                # the same assistant message instead of a role='tool' message.
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "London"},
+                        },
+                    }
+                ],
+                "tool_responses": [
+                    {
+                        "name": "get_weather",
+                        "response": {"temperature": 20},
+                    }
+                ],
+            },
+        ]
+        result = _render(gemma4_template, messages)
+        assert "<|tool_response>" in result
+        assert "response:get_weather{" in result
+        # The dict response is expected to render with a bare `temperature:` key.
+        assert "temperature:" in result
+    def test_generation_prompt_not_after_tool_response(self, gemma4_template):
+        """add_generation_prompt=True should NOT add <|turn>model when the
+        last message type was tool_response (the model turn continues)."""
+        messages = [
+            {"role": "user", "content": "Weather?"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "get_weather",
+                            "arguments": {"city": "London"},
+                        },
+                    }
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_1",
+                "content": "sunny",
+            },
+        ]
+        result = _render(gemma4_template, messages, add_generation_prompt=True)
+        assert not result.strip().endswith("<|turn>model\n")
+    def test_reasoning_in_tool_chains(self, gemma4_template):
+        """reasoning field on assistant with tool_calls after last user
+        message emits <|channel>thought\\n...<channel|>."""
+        messages = [
+            {"role": "user", "content": "Calculate something"},
+            # This assistant message comes after the last (and only) user
+            # message, so its reasoning is expected to be kept.
+            {
+                "role": "assistant",
+                "content": "",
+                "reasoning": "Let me think about this...",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "calculator",
+                            "arguments": {"expr": "2+2"},
+                        },
+                    }
+                ],
+            },
+        ]
+        result = _render(gemma4_template, messages)
+        assert "<|channel>thought\n" in result
+        assert "Let me think about this..." in result
+        assert "<channel|>" in result
+    def test_reasoning_not_before_last_user(self, gemma4_template):
+        """reasoning on assistant BEFORE the last user message is dropped."""
+        messages = [
+            {"role": "user", "content": "First"},
+            {
+                "role": "assistant",
+                "content": "Response",
+                "reasoning": "Old reasoning that should be dropped",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "fn",
+                            "arguments": {},
+                        },
+                    }
+                ],
+            },
+            # A later user message makes the assistant turn above "historic".
+            {"role": "user", "content": "Second"},
+        ]
+        result = _render(gemma4_template, messages, add_generation_prompt=True)
+        # Matching on the prefix is enough to prove the reasoning text is gone.
+        assert "Old reasoning" not in result
+    def test_strip_thinking_in_model_content(self, gemma4_template):
+        """<|channel>...<channel|> in model content is stripped by the
+        strip_thinking macro."""
+        messages = [
+            {"role": "user", "content": "Hi"},
+            {
+                "role": "assistant",
+                # Thought block embedded directly in the content string.
+                "content": ("<|channel>internal thought<channel|>Visible answer"),
+            },
+        ]
+        result = _render(gemma4_template, messages)
+        # Only the text after the closing channel marker may remain.
+        assert "internal thought" not in result
+        assert "Visible answer" in result
+    def test_multi_turn_tool_chain(self, gemma4_template):
+        """assistant->tool->assistant->tool produces exactly one
+        <|turn>model (later assistants continue the same turn)."""
+        messages = [
+            {"role": "user", "content": "Do two things"},
+            # First tool call and its result.
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "c1",
+                        "function": {"name": "step1", "arguments": {}},
+                    },
+                ],
+            },
+            {"role": "tool", "tool_call_id": "c1", "content": "result1"},
+            # Second tool call and its result, with no user message in between.
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "id": "c2",
+                        "function": {"name": "step2", "arguments": {}},
+                    },
+                ],
+            },
+            {"role": "tool", "tool_call_id": "c2", "content": "result2"},
+        ]
+        result = _render(gemma4_template, messages, add_generation_prompt=True)
+        # Neither the second assistant message nor the generation prompt may
+        # open another model turn.
+        assert result.count("<|turn>model\n") == 1
+    def test_format_argument_types(self, gemma4_template):
+        """Strings wrapped in <|"|>, booleans as true/false, numbers bare."""
+        messages = [
+            {"role": "user", "content": "Test"},
+            {
+                "role": "assistant",
+                "content": "",
+                "tool_calls": [
+                    {
+                        "function": {
+                            "name": "test_fn",
+                            # One argument of each scalar kind under test.
+                            "arguments": {
+                                "name": "Alice",
+                                "active": True,
+                                "count": 42,
+                            },
+                        },
+                    }
+                ],
+            },
+        ]
+        result = _render(gemma4_template, messages)
+        assert '<|"|>Alice<|"|>' in result
+        # Python True must be rendered in lowercase, with no space after ':'.
+        assert "active:true" in result
+        assert "count:42" in result
--- a/tests/tool_parsers/test_gemma4_tool_parser.py
+++ b/tests/tool_parsers/test_gemma4_tool_parser.py
@@ -85,6 +85,14 @@ class TestParseGemma4Args:
        result = _parse_gemma4_args("flag:false")
        assert result == {"flag": False}
+    def test_null_value(self):
+        # Bare `null` must parse as None (Python), not the string "null".
+        # Without this, tool_choice=auto would emit `{"param": "null"}`
+        # instead of `{"param": null}` for nullable tool parameters.
+        result = _parse_gemma4_args("param:null")
+        assert result == {"param": None}
+        assert json.dumps(result) == '{"param": null}'
    def test_mixed_types(self):
        result = _parse_gemma4_args(
            'name:<|"|>test<|"|>,count:42,active:true,score:3.14'
@@ -114,6 +122,19 @@ class TestParseGemma4Args:
        result = _parse_gemma4_args("key:")
        assert result == {"key": ""}
+    def test_empty_value_partial_withheld(self):
+        """Key with no value is withheld in partial mode to avoid premature emission."""
+        result = _parse_gemma4_args("key:", partial=True)
+        assert result == {}
+        # also with a space after the colon
+        result = _parse_gemma4_args("key: ", partial=True)
+        assert result == {}
+    def test_empty_value_after_other_keys_partial_withheld(self):
+        """Trailing key with no value is withheld; earlier keys are kept."""
+        result = _parse_gemma4_args('name:<|"|>test<|"|>,flag:', partial=True)
+        assert result == {"name": "test"}
 class TestParseGemma4Array:
    def test_string_array(self):
@@ -491,6 +512,51 @@ class TestStreamingExtraction:
            assert parsed_args["count"] == 42
            assert parsed_args["active"] is True
+    def test_streaming_boolean_split_across_chunks(self, parser, mock_request):
+        """Boolean value split across token boundaries must not corrupt JSON."""
+        chunks = [
+            "<|tool_call>",
+            # "true"[:3] == "tru": the chunk ends in the middle of the literal.
+            "call:search{input:{all:" + "true"[:3],
+            # Final "e" of "true", then both closing braces.
+            "e}}",
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text, "No arguments were streamed"
+        # The concatenated argument deltas must form valid JSON.
+        parsed_args = json.loads(args_text)
+        assert parsed_args["input"]["all"] is True
+    def test_streaming_false_split_across_chunks(self, parser, mock_request):
+        """Boolean false split across chunks."""
+        chunks = [
+            "<|tool_call>",
+            # "false"[:4] == "fals": the chunk ends in the middle of the literal.
+            "call:set{flag:" + "false"[:4],
+            "e}",
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text, "No arguments were streamed"
+        # The concatenated argument deltas must form valid JSON.
+        parsed_args = json.loads(args_text)
+        assert parsed_args["flag"] is False
+    def test_streaming_number_split_across_chunks(self, parser, mock_request):
+        """Number split across chunks must not change type."""
+        chunks = [
+            "<|tool_call>",
+            # The digits of 42 arrive in two separate chunks ("4" then "2").
+            "call:set{count:4",
+            "2}",
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text, "No arguments were streamed"
+        parsed_args = json.loads(args_text)
+        # Must be the integer 42, not e.g. 4 or a string.
+        assert parsed_args["count"] == 42
    def test_streaming_empty_args(self, parser, mock_request):
        """Tool call with no arguments."""
        chunks = [
@@ -502,3 +568,119 @@ class TestStreamingExtraction:
        results = self._simulate_streaming(parser, mock_request, chunks)
        name = self._collect_function_name(results)
        assert name == "get_status"
+    def test_streaming_split_delimiter_no_invalid_json(self, parser, mock_request):
+        """Partial <|"|> delimiter chars must not leak into streamed JSON.
+        Reproduces the bug from https://github.com/vllm-project/vllm/issues/38946
+        where a token boundary splits the string delimiter, leaving fragments
+        like '<|' at the end of a parsed value which then corrupt the JSON.
+        """
+        chunks = [
+            "<|tool_call>",
+            "call:todowrite{",
+            # The closing <|"|> delimiter is split: '<|' here, '"|>' in the
+            # next chunk.
+            'content:<|"|>Buy milk<|',
+            '"|>}',
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text, "No arguments were streamed"
+        # Must be valid JSON — the original bug caused a JSON parse error
+        parsed_args = json.loads(args_text)
+        assert parsed_args["content"] == "Buy milk"
+        # Ensure no raw delimiter fragments leaked into the JSON
+        assert "<|" not in args_text, (
+            f"Partial delimiter leaked into JSON: {args_text!r}"
+        )
+    def test_streaming_does_not_duplicate_plain_text_after_tool_call(
+        self, parser, mock_request, monkeypatch
+    ):
+        """Buffered plain text after a tool call must not corrupt current_text."""
+        # Record every current_text handed to the parser's internal extractor.
+        captured_current_texts: list[str] = []
+        original_extract_streaming = parser._extract_streaming
+        def wrapped_extract_streaming(previous_text, current_text, delta_text):
+            # Spy wrapper: log the argument, then delegate unchanged.
+            captured_current_texts.append(current_text)
+            return original_extract_streaming(previous_text, current_text, delta_text)
+        monkeypatch.setattr(parser, "_extract_streaming", wrapped_extract_streaming)
+        chunks = [
+            "<|tool_call>",
+            "call:get_weather{",
+            'location:<|"|>Paris<|"|>}',
+            # The "<" of the trailing "<div>" arrives glued to the end marker.
+            "<tool_call|><",
+            "div>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        content_parts = [
+            delta.content for delta, _ in results if delta is not None and delta.content
+        ]
+        # The plain text must be emitted exactly once as content.
+        assert "".join(content_parts) == "<div>"
+        # The last text seen by the extractor must hold a single "<", not two.
+        assert captured_current_texts[-1].endswith("<tool_call|><div>")
+        assert not captured_current_texts[-1].endswith("<tool_call|><<div>")
+    def test_streaming_html_argument_does_not_duplicate_tag_prefixes(
+        self, parser, mock_request
+    ):
+        """HTML content inside tool arguments must not be duplicated."""
+        # Several chunks end with a lone "<" whose tag name only arrives in
+        # the following chunk.
+        chunks = [
+            "<|tool_call>",
+            "call:write_file{",
+            'path:<|"|>index.html<|"|>,',
+            'content:<|"|><!DOCTYPE html>\n<',
+            'html lang="zh-CN">\n<',
+            "head>\n    <",
+            'meta charset="UTF-8">\n    <',
+            'meta name="viewport" content="width=device-width">\n',
+            '<|"|>}',
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text
+        parsed_args = json.loads(args_text)
+        assert parsed_args["path"] == "index.html"
+        # Expected value is the chunk payloads joined, with each "<" kept once.
+        assert (
+            parsed_args["content"] == "<!DOCTYPE html>\n"
+            '<html lang="zh-CN">\n'
+            "<head>\n"
+            '    <meta charset="UTF-8">\n'
+            '    <meta name="viewport" content="width=device-width">\n'
+        )
+    def test_streaming_trailing_bare_bool_not_duplicated(self, parser, mock_request):
+        """Trailing bare boolean must not be streamed twice."""
+        chunks = [
+            "<|tool_call>",
+            "call:Edit{",
+            'file_path:<|"|>src/env.py<|"|>,',
+            'old_string:<|"|>old_val<|"|>,',
+            'new_string:<|"|>new_val<|"|>,',
+            # The last key arrives without its value; the value and the
+            # closing brace follow in the next chunk.
+            "replace_all:",
+            "false}",
+            "<tool_call|>",
+        ]
+        results = self._simulate_streaming(parser, mock_request, chunks)
+        args_text = self._collect_arguments(results)
+        assert args_text, "No arguments were streamed"
+        parsed_args = json.loads(args_text)
+        assert parsed_args == {
+            "file_path": "src/env.py",
+            "old_string": "old_val",
+            "new_string": "new_val",
+            "replace_all": False,
+        }
+        # The key itself must appear only once in the streamed text.
+        assert args_text.count("replace_all") == 1
--- a/tests/v1/e2e/spec_decode/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py
@@ -542,12 +542,16 @@ def test_eagle_correctness_light(
            "auto",
            0.8,
        ),
-        (
+        pytest.param(
            ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1),
            False,
            False,
            "transformers",
            0.8,
+            # TODO(hmellor): figure out why memory usage is so high
+            marks=pytest.mark.skip(
+                reason="Feature is experimental and uses too much memory in CI",
+            ),
        ),
        pytest.param(
            (

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -3397,3 +3397,38 @@ if hasattr(torch.ops._C, "hadacore_transform"):
    @register_fake("_C::hadacore_transform")
    def _hadacore_transform_fake(x: torch.Tensor, inplace: bool) -> torch.Tensor:
        return torch.empty_like(x) if not inplace else x
+# Register the fake implementation only when the compiled op is available.
+if hasattr(torch.ops._C, "minimax_allreduce_rms"):
+    @register_fake("_C::minimax_allreduce_rms")
+    def _minimax_allreduce_rms_fake(
+        input: torch.Tensor,
+        norm_weight: torch.Tensor,
+        workspace: torch.Tensor,
+        rank: int,
+        nranks: int,
+        eps: float,
+    ) -> torch.Tensor:
+        """Fake impl: return an uninitialised tensor shaped like ``input``.
+
+        All other arguments are ignored here.
+        """
+        return torch.empty_like(input)
+if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):
+    @register_fake("_C::minimax_allreduce_rms_qk")
+    def _minimax_allreduce_rms_qk_fake(
+        qkv: torch.Tensor,
+        norm_weight_q: torch.Tensor,
+        norm_weight_k: torch.Tensor,
+        workspace: torch.Tensor,
+        q_size: int,
+        kv_size: int,
+        rank: int,
+        nranks: int,
+        eps: float,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        token_num = qkv.shape[0]
+        return (
+            torch.empty([token_num, q_size], dtype=qkv.dtype, device=qkv.device),
+            torch.empty([token_num, kv_size], dtype=qkv.dtype, device=qkv.device),
+        )
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -205,6 +205,8 @@ def support_torch_compile(
                if v.annotation in [
                    torch.Tensor,
                    torch.Tensor | None,
+                    torch.FloatTensor,
+                    torch.FloatTensor | None,
                    IntermediateTensors,
                    IntermediateTensors | None,
                ]:
@@ -346,7 +348,7 @@ def _support_torch_compile(
    def __init__(
        self: _T,
-        *,
+        *args,
        vllm_config: VllmConfig | None = None,
        prefix: str = "",
        **kwargs: Any,
@@ -357,11 +359,24 @@ def _support_torch_compile(
        # NOTE: to support multimodal models (such as encoder),
        # we may not have vllm_config so we may need to patch it
        sig = inspect.signature(old_init)
+        # Check that any positional arguments match the old_init method signature
+        annotations = [p.annotation for p in sig.parameters.values()]
+        for arg, annotation in zip(args, annotations):
+            if annotation is inspect._empty:
+                continue
+            if not isinstance(arg, annotation):
+                init = f"'{type(self).__name__}.__init__'"
+                arg_type = f"'{type(arg).__name__}'"
+                raise TypeError(
+                    f"{init} received a positional argument of type {arg_type}, "
+                    "but no parameter of that type was found in the method signature. "
+                    f"Please either annotate {init} or pass it as a keyword argument."
+                )
        if "vllm_config" in sig.parameters:
            kwargs["vllm_config"] = vllm_config
        if "prefix" in sig.parameters:
            kwargs["prefix"] = prefix
-        old_init(self, **kwargs)
+        old_init(self, *args, **kwargs)
        self.vllm_config = vllm_config
        self.compilation_config = self.vllm_config.compilation_config

--- a/vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
+++ b/vllm/compilation/passes/fusion/minimax_qk_norm_fusion.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Fusion pass: replace MiniMax QK allreduce + RMS norm with the Lamport
+fused kernel (minimax_allreduce_rms_qk) for decode-size batches.
+Pattern (inlined forward_qk in compiled graph):
+    q, k, v = qkv.split([q_size, kv_size, kv_size], -1)
+    q_fp32 = q.to(float32); k_fp32 = k.to(float32)
+    q_var = q_fp32.pow(2).mean(-1, keepdim=True)
+    k_var = k_fp32.pow(2).mean(-1, keepdim=True)
+    qk_var = cat([q_var, k_var], -1)
+    qk_var = allreduce(qk_var) / tp_world
+    q_var, k_var = qk_var.chunk(2, -1)
+    q_out = (q_fp32 * rsqrt(q_var + eps) * q_weight).to(orig_dtype)
+    k_out = (k_fp32 * rsqrt(k_var + eps) * k_weight).to(orig_dtype)
+    return q_out, k_out, v
+Replacement (pure, no in-place on qkv/q/k):
+    q_out, k_out = minimax_qk_norm_fused(qkv, q_weight, k_weight, workspace, ...)
+    v = qkv.split([q_size, kv_size, kv_size], -1)[2]
+    return q_out, k_out, v
+is_applicable_for_range: only fires for compile_range.end <= max_decode_tokens
+so that large prefill batches fall through to the original forward_qk (= main).
+"""
+import torch
+import torch._inductor.pattern_matcher as pm
+import torch.fx as fx
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.config import VllmConfig
+from vllm.config.utils import Range
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.logger import init_logger
+from vllm.utils.torch_utils import direct_register_custom_op
+from ..inductor_pass import enable_fake_mode
+from ..vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
+logger = init_logger(__name__)
+MAX_TOKEN_NUM = 2048
+_MINIMAX_QK_NORM_FUSED_OP = None
+# Define and register the vLLM-level wrapper op only when the compiled kernel
+# exists; otherwise _MINIMAX_QK_NORM_FUSED_OP stays None.
+if hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):
+    def _minimax_qk_norm_fused(
+        qkv: torch.Tensor,
+        norm_weight_q: torch.Tensor,
+        norm_weight_k: torch.Tensor,
+        q_size: int,
+        kv_size: int,
+        rank: int,
+        nranks: int,
+        eps: float,
+        max_tokens: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Fetch the allreduce workspace and call ``minimax_allreduce_rms_qk``.
+
+        The workspace is not part of this op's signature; it is looked up on
+        each call from ``rank``, ``nranks``, ``max_tokens`` and the TP group's
+        CPU process group. Returns the two tensors produced by the kernel.
+        """
+        # Imported lazily inside the op body rather than at module import time.
+        from vllm.distributed.parallel_state import get_tp_group
+        from vllm.model_executor.layers.mamba.lamport_workspace import (
+            get_allreduce_workspace,
+        )
+        workspace = get_allreduce_workspace(
+            rank=rank,
+            world_size=nranks,
+            max_tokens=max_tokens,
+            process_group=get_tp_group().cpu_group,
+        )
+        return torch.ops._C.minimax_allreduce_rms_qk(
+            qkv,
+            norm_weight_q,
+            norm_weight_k,
+            workspace,
+            q_size,
+            kv_size,
+            rank,
+            nranks,
+            eps,
+        )
+    def _minimax_qk_norm_fused_fake(
+        qkv: torch.Tensor,
+        norm_weight_q: torch.Tensor,
+        norm_weight_k: torch.Tensor,
+        q_size: int,
+        kv_size: int,
+        rank: int,
+        nranks: int,
+        eps: float,
+        max_tokens: int,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Fake impl: uninitialised ``[T, q_size]`` and ``[T, kv_size]`` outputs
+        with the dtype and device of ``qkv``, where ``T = qkv.shape[0]``."""
+        T = qkv.shape[0]
+        return (
+            torch.empty([T, q_size], dtype=qkv.dtype, device=qkv.device),
+            torch.empty([T, kv_size], dtype=qkv.dtype, device=qkv.device),
+        )
+    # Registered with no mutated arguments.
+    direct_register_custom_op(
+        op_name="minimax_qk_norm_fused",
+        op_func=_minimax_qk_norm_fused,
+        fake_impl=_minimax_qk_norm_fused_fake,
+        mutates_args=[],
+    )
+    _MINIMAX_QK_NORM_FUSED_OP = torch.ops.vllm.minimax_qk_norm_fused.default
+class MiniMaxQKNormPattern:
+    """
+    Match the forward_qk allreduce+rms pattern and replace with Lamport kernel.
+    """
+    def __init__(
+        self,
+        q_size: int,
+        kv_size: int,
+        eps: float,
+        tp_world: int,
+        tp_rank: int,
+        max_tokens: int,
+        dtype: torch.dtype,
+        device: str | None,
+    ) -> None:
+        """Store the constants that get baked into the pattern and replacement."""
+        self.q_size = q_size
+        self.kv_size = kv_size
+        self.eps = eps
+        self.tp_world = tp_world
+        self.tp_rank = tp_rank
+        self.max_tokens = max_tokens
+        self.dtype = dtype
+        self.device = device
+    def get_inputs(self) -> list[torch.Tensor]:
+        """Return example ``[qkv, q_weight, k_weight]`` tensors for registration.
+
+        ``qkv`` has 4 rows and ``q_size + 2 * kv_size`` columns; all tensors are
+        uninitialised and use the configured dtype and device.
+        """
+        T = 4
+        qkv = torch.empty(
+            [T, self.q_size + 2 * self.kv_size],
+            device=self.device,
+            dtype=self.dtype,
+        )
+        q_weight = torch.empty([self.q_size], device=self.device, dtype=self.dtype)
+        k_weight = torch.empty([self.kv_size], device=self.device, dtype=self.dtype)
+        return [qkv, q_weight, k_weight]
+    def register(self, pm_pass: PatternMatcherPass) -> None:
+        """Register two pattern variants on ``pm_pass``, sharing one replacement."""
+        # Copy the constants into locals so the nested functions close over
+        # plain values rather than ``self``.
+        q_size = self.q_size
+        kv_size = self.kv_size
+        eps = self.eps
+        tp_world = self.tp_world
+        max_tokens = self.max_tokens
+        tp_rank = self.tp_rank
+        dtype = self.dtype
+        def pattern(
+            qkv: torch.Tensor,
+            q_weight: torch.Tensor,
+            k_weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            # Variant 1: one split feeding q, k and v.
+            q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
+            q_fp32 = q.to(torch.float32)
+            k_fp32 = k.to(torch.float32)
+            # Mean of squares per row for q and k.
+            q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
+            k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
+            # Both values are concatenated so a single all-reduce covers them,
+            # then divided by the TP world size.
+            qk_var = torch.cat([q_var, k_var], dim=-1)
+            qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
+            q_var, k_var = qk_var.chunk(2, dim=-1)
+            q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
+            k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
+            return q_out, k_out, v
+        def replacement(
+            qkv: torch.Tensor,
+            q_weight: torch.Tensor,
+            k_weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            # The wrapper op only exists when the compiled kernel was found.
+            assert _MINIMAX_QK_NORM_FUSED_OP is not None
+            q_out, k_out = torch.ops.vllm.minimax_qk_norm_fused(
+                qkv,
+                q_weight,
+                k_weight,
+                q_size,
+                kv_size,
+                tp_rank,
+                tp_world,
+                eps,
+                max_tokens,
+            )
+            # v is not produced by the fused op; take it from qkv directly.
+            _, _, v = qkv.split([q_size, kv_size, kv_size], dim=-1)
+            return q_out, k_out, v
+        pm.register_replacement(
+            pattern, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+        # Second pattern: three separate split_with_sizes nodes (one per output),
+        # each with _users=1. This occurs when the QKV projection uses a
+        # functional GEMM kernel (e.g. cutlass_scaled_mm via auto_functionalized),
+        # which causes inductor to generate one split per consumer.
+        def pattern_split3(
+            qkv: torch.Tensor,
+            q_weight: torch.Tensor,
+            k_weight: torch.Tensor,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            # Same computation as ``pattern``, but with one split per output.
+            q = qkv.split([q_size, kv_size, kv_size], dim=-1)[0]
+            k = qkv.split([q_size, kv_size, kv_size], dim=-1)[1]
+            v = qkv.split([q_size, kv_size, kv_size], dim=-1)[2]
+            q_fp32 = q.to(torch.float32)
+            k_fp32 = k.to(torch.float32)
+            q_var = q_fp32.pow(2).mean(dim=-1, keepdim=True)
+            k_var = k_fp32.pow(2).mean(dim=-1, keepdim=True)
+            qk_var = torch.cat([q_var, k_var], dim=-1)
+            qk_var = tensor_model_parallel_all_reduce(qk_var) / tp_world
+            q_var, k_var = qk_var.chunk(2, dim=-1)
+            q_out = (q_fp32 * torch.rsqrt(q_var + eps) * q_weight).to(dtype)
+            k_out = (k_fp32 * torch.rsqrt(k_var + eps) * k_weight).to(dtype)
+            return q_out, k_out, v
+        pm.register_replacement(
+            pattern_split3, replacement, self.get_inputs(), pm.fwd_only, pm_pass
+        )
+class MiniMaxQKNormPass(VllmPatternMatcherPass):
+    """
+    Replace forward_qk allreduce+norm with the Lamport fused kernel.
+    Only applied for decode-size compile ranges (small token counts).
+    """
+    def __init__(self, config: VllmConfig) -> None:
+        super().__init__(config)
+        self.disabled = True
+        if _MINIMAX_QK_NORM_FUSED_OP is None:
+            logger.warning_once(
+                "minimax_allreduce_rms_qk op not found, MiniMaxQKNormPass disabled."
+            )
+            return
+        tp_world = get_tensor_model_parallel_world_size()
+        if tp_world <= 1:
+            logger.warning_once("MiniMaxQKNormPass disabled: tp_size <= 1.")
+            return
+        if config.model_config is None:
+            logger.warning_once("MiniMaxQKNormPass disabled: no model_config.")
+            return
+        hf_cfg = config.model_config.hf_config
+        model_name = getattr(hf_cfg, "architectures", "")[0]
+        if model_name != "MiniMaxM2ForCausalLM":
+            return
+        num_attention_heads = getattr(hf_cfg, "num_attention_heads", 0)
+        num_key_value_heads = getattr(hf_cfg, "num_key_value_heads", 0)
+        hidden_size = getattr(hf_cfg, "hidden_size", 0)
+        head_dim = getattr(hf_cfg, "head_dim", 0)
+        eps: float = getattr(hf_cfg, "rms_norm_eps", 1e-6)
+        if (
+            num_attention_heads != 48
+            or num_key_value_heads != 8
+            or hidden_size != 3072
+            or head_dim != 128
+        ):
+            logger.warning_once(
+                "MiniMaxQKNormPass disabled: cannot infer model info from hf_config."
+            )
+            return
+        num_heads_per_rank = num_attention_heads // tp_world
+        num_kv_heads_per_rank = max(1, num_key_value_heads // tp_world)
+        q_size = num_heads_per_rank * head_dim
+        kv_size = num_kv_heads_per_rank * head_dim
+        self.max_token_num = min(
+            MAX_TOKEN_NUM, config.scheduler_config.max_num_batched_tokens
+        )
+        tp_rank = get_tensor_model_parallel_rank()
+        # Allocate Lamport workspace first.
+        from vllm.distributed.parallel_state import get_tp_group
+        from vllm.model_executor.layers.mamba.lamport_workspace import (
+            get_allreduce_workspace,
+        )
+        get_allreduce_workspace(
+            rank=tp_rank,
+            world_size=tp_world,
+            max_tokens=self.max_token_num,
+            process_group=get_tp_group().cpu_group,
+        )
+        self.patterns: PatternMatcherPass = PatternMatcherPass(
+            pass_name="minimax_qk_norm_pass"
+        )
+        self._register_patterns(q_size, kv_size, eps, tp_world, tp_rank)
+        self.dump_patterns(config, self.patterns)
+        self.disabled = False
+    @enable_fake_mode
+    def _register_patterns(
+        self,
+        q_size: int,
+        kv_size: int,
+        eps: float,
+        tp_world: int,
+        tp_rank: int,
+    ) -> None:
+        MiniMaxQKNormPattern(
+            q_size=q_size,
+            kv_size=kv_size,
+            eps=eps,
+            tp_world=tp_world,
+            tp_rank=tp_rank,
+            max_tokens=self.max_token_num,
+            dtype=self.model_dtype,
+            device=self.device,
+        ).register(self.patterns)
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        if self.disabled:
+            return False
+        return bool(compile_range.end <= self.max_token_num)
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph) -> None:
+        if self.disabled:
+            return
+        self.matched_count = self.patterns.apply(graph)
+        logger.debug("MiniMaxQKNormPass replaced %s patterns", self.matched_count)
+    def uuid(self) -> str:
+        return VllmInductorPass.hash_source(self, MiniMaxQKNormPattern)
--- a/vllm/compilation/passes/pass_manager.py
+++ b/vllm/compilation/passes/pass_manager.py
@@ -36,6 +36,7 @@ if current_platform.is_cuda_alike():
 if current_platform.is_cuda():
    from .fusion.allreduce_rms_fusion import AllReduceFusionPass
    from .fusion.collective_fusion import AsyncTPPass
+    from .fusion.minimax_qk_norm_fusion import MiniMaxQKNormPass
 from .inductor_pass import (
    CustomGraphPass,
@@ -124,6 +125,9 @@ class PostGradPassManager(CustomGraphPass):  # type: ignore[misc]
            if self.pass_config.fuse_allreduce_rms:
                self.passes += [AllReduceFusionPass(config)]
+            if self.pass_config.fuse_minimax_qk_norm:
+                self.passes += [MiniMaxQKNormPass(config)]
            if self.pass_config.fuse_norm_quant:
                self.passes += [RMSNormQuantFusionPass(config)]
                if rocm_aiter_ops.is_enabled():

--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -132,6 +132,8 @@ class PassConfig:
    """Enable async TP."""
    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
    """Enable flashinfer allreduce fusion."""
+    fuse_minimax_qk_norm: bool = None  # type: ignore[assignment]
+    """Enable fused allreduce+RMSNorm for MiniMax QK norm."""
    enable_qk_norm_rope_fusion: bool = False
    """Enable fused Q/K RMSNorm + RoPE pass."""
@@ -282,7 +284,7 @@ class PassConfig:
        """
        enabled_fusions = [
            f.name[len("fuse_") :]
-            for f in fields(self)
+            for f in fields(self)  # type: ignore[arg-type]
            if getattr(self, f.name) and f.name.startswith("fuse_")
        ]
@@ -486,9 +488,10 @@ class CompilationConfig:
    If empty list [], no ops are excluded (suitable for full cudagraphs)."""
    compile_mm_encoder: bool = False
    """Whether or not to compile the multimodal encoder.
-    Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models
+    Currently, this only works for `Qwen2_5_vl` and `mLLaMa4` models on selected
-    on selected platforms. Disabled by default until more models
+    platforms. It may also work for models loaded with the Transformers modeling backend
-    are supported/tested to work."""
+    if the encoder is compilable. Disabled by default until more models are
+    supported/tested to work."""
    # Vision encoder CUDA graph
    cudagraph_mm_encoder: bool = False

--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -805,6 +805,8 @@ class SpeculativeConfig:
            "deepseek_v3",
            "kimi_k2",
            "kimi_k25",
+            "minimax_m2",
+            "gemma4",
        ]
        if (
            self.method in ("eagle3", "extract_hidden_states")