Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

a3f8d5dd · zhuwenwen · 8d75f22e · f34eca5f · a3f8d5dd · a3f8d5dd
Commit a3f8d5dd authored Dec 17, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
-from vllm.tokenizers import (
+from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
-    MistralTokenizer,
+from vllm.tokenizers.mistral import MistralTokenizer
-    TokenizerLike,
-    cached_tokenizer_from_config,
-)
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import (

--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from ....conftest import ImageTestAssets
+from ...utils import build_model_context
+@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"])
+def test_get_image_size_with_most_features(
+    image_assets: ImageTestAssets, model_id: str
+):
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs={"do_pan_and_scan": True},
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs: dict[str, object] = {}
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    max_image_size = processor.info.get_image_size_with_most_features()
+    max_tokens = processor.info.get_num_image_tokens(
+        image_width=max_image_size.width,
+        image_height=max_image_size.height,
+        processor=hf_processor,
+    )
+    prompt = "<start_of_image>"
+    image_seq_length = hf_processor.image_seq_length
+    for asset in image_assets:
+        mm_data = {"image": [asset.pil_image]}
+        processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+        mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
+        num_patches_tensor = mm_kwargs_data["num_patches"]
+        tokens = int(num_patches_tensor.item()) * image_seq_length
+        assert tokens <= max_tokens
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -60,12 +60,12 @@ def test_profiling(model_id: str, max_model_len: int):
        total_num_patches.item() + num_tiles.item() + 3
    )  # image start, image, image end
-    profiled_tokens = profiler.get_mm_max_contiguous_tokens(
+    profiled_tokens = profiler.get_mm_max_tokens(
        max_model_len,
        mm_counts=mm_counts,
    )
-    assert total_tokens == profiled_tokens["image"]
+    assert total_num_patches == profiled_tokens["image"]
    assert total_tokens == sum(
        placeholder.length
        for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]

--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -53,3 +53,38 @@ def test_processor_override(
    assert img_tok_count == expected_toks_per_img * num_imgs
    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
    assert pixel_shape[1] == expected_pixels_shape[1]
+@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
+@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
+def test_get_image_size_with_most_features(
+    image_assets: ImageTestAssets,
+    model_id: str,
+    max_pixels: int,
+):
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs={"max_pixels": max_pixels},
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    hf_processor_mm_kwargs: dict[str, object] = {}
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size
+    max_image_size = processor.info.get_image_size_with_most_features()
+    max_tokens = processor.info.get_num_image_tokens(
+        image_width=max_image_size.width,
+        image_height=max_image_size.height,
+        image_processor=hf_processor.image_processor,
+    )
+    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    for asset in image_assets:
+        mm_data = {"image": [asset.pil_image]}
+        processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+        grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
+        t, h, w = grid_thw[0]
+        tokens = (t * h * w) // (merge_size**2)
+        assert tokens < max_tokens
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -8,6 +8,7 @@ from typing import Any, TypeAlias
 import numpy as np
 import pytest
+import torch
 import torch.nn as nn
 from PIL import Image
@@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.torch_utils import set_default_torch_dtype
+from ....utils import create_new_process_for_each_test
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import dummy_hf_overrides
 from .test_common import get_model_ids_to_test, get_text_token_prompts
@@ -136,6 +138,7 @@ def create_batched_mm_kwargs(
    )
+# TODO(Isotr0py): Don't initialize model during test
 @contextmanager
 def initialize_dummy_model(
    model_cls: type[nn.Module],
@@ -150,16 +153,21 @@ def initialize_dummy_model(
        backend="nccl",
    )
    initialize_model_parallel(tensor_model_parallel_size=1)
+    current_device = torch.get_default_device()
    vllm_config = VllmConfig(model_config=model_config)
    with set_current_vllm_config(vllm_config=vllm_config):
        with set_default_torch_dtype(model_config.dtype):
+            torch.set_default_device(current_platform.device_type)
            model = model_cls(vllm_config=vllm_config)
+            torch.set_default_device(current_device)
        yield model
    del model
    cleanup_dist_env_and_memory()
+@create_new_process_for_each_test()
 @pytest.mark.parametrize("model_id", get_model_ids_to_test())
 def test_model_tensor_schema(model_id: str):
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -173,10 +173,7 @@ class _HfExamplesInfo:
 _TEXT_GENERATION_EXAMPLE_MODELS = {
    # [Decoder-only]
-    "AfmoeForCausalLM": _HfExamplesInfo(
+    "AfmoeForCausalLM": _HfExamplesInfo("arcee-ai/Trinity-Nano-Preview"),
-        "arcee-ai/Trinity-Nano",
-        is_available_online=False,
-    ),
    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"),
    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True),
    "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True),
@@ -359,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    ),
    "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
    "MistralLarge3ForCausalLM": _HfExamplesInfo(
-        "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False
+        "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4"
    ),
    "MixtralForCausalLM": _HfExamplesInfo(
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -576,12 +573,17 @@ _AUTOMATIC_CONVERTED_MODELS = {
    "Qwen3ForSequenceClassification": _HfExamplesInfo(
        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
    ),
+    "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
 }
 _MULTIMODAL_EXAMPLE_MODELS = {
    # [Decoder-only]
    "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
+    "AudioFlamingo3ForConditionalGeneration": _HfExamplesInfo(
+        "nvidia/audio-flamingo-3-hf", min_transformers_version="5.0.0.dev"
+    ),
    "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereLabs/aya-vision-8b"),
+    "BagelForConditionalGeneration": _HfExamplesInfo("ByteDance-Seed/BAGEL-7B-MoT"),
    "BeeForConditionalGeneration": _HfExamplesInfo(
        "Open-Bee/Bee-8B-RL",
        trust_remote_code=True,
@@ -638,7 +640,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "HunYuanVLForConditionalGeneration": _HfExamplesInfo(
        "tencent/HunyuanOCR",
-        is_available_online=False,
+        hf_overrides={"num_experts": 0},
    ),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo(
        "HuggingFaceM4/Idefics3-8B-Llama3",
@@ -677,8 +679,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
    ),
    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
-        "lightonai/LightOnOCR-1B",
+        "lightonai/LightOnOCR-1B-1025"
-        is_available_online=False,
    ),
    "Llama4ForConditionalGeneration": _HfExamplesInfo(
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
@@ -782,8 +783,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
            "ministral-3": "mistralai/Ministral-3-3B-Instruct-2512",
        },
        tokenizer_mode="mistral",
-        # TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available.
-        is_available_online=False,
    ),
    "QwenVLForConditionalGeneration": _HfExamplesInfo(
        "Qwen/Qwen-VL",
@@ -846,7 +845,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
-    "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"),
+    "WhisperForConditionalGeneration": _HfExamplesInfo(
+        "openai/whisper-large-v3-turbo",
+        extras={"v3": "openai/whisper-large-v3"},
+    ),
    # [Cross-encoder]
    "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"),
 }
@@ -889,6 +891,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "EagleMistralLarge3ForCausalLM": _HfExamplesInfo(
        "mistralai/Mistral-Large-3-675B-Instruct-2512",
        speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle",
+        # TODO: revert once the OOM in CI has been figured out
        is_available_online=False,
    ),
    "LlamaForCausalLMEagle3": _HfExamplesInfo(

--- a/tests/multimodal/test_sparse_tensor_validation_unit.py
+++ b/tests/multimodal/test_sparse_tensor_validation_unit.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for sparse tensor validation.
+Simple, fast unit tests that can run without server fixtures.
+Run with: pytest tests/multimodal/test_sparse_tensor_validation_unit.py -v
+"""
+import io
+import pytest
+import torch
+class TestSparseTensorValidationContextManager:
+    """Test that torch.sparse.check_sparse_tensor_invariants() works as expected."""
+    def test_valid_sparse_tensor_passes(self):
+        """Valid sparse tensors should pass validation."""
+        indices = torch.tensor([[0, 1], [0, 1]])
+        values = torch.tensor([1.0, 2.0])
+        shape = (2, 2)
+        with torch.sparse.check_sparse_tensor_invariants():
+            tensor = torch.sparse_coo_tensor(indices, values, shape)
+            dense = tensor.to_dense()
+        assert dense.shape == shape
+    def test_out_of_bounds_indices_rejected(self):
+        """Sparse tensors with out-of-bounds indices should be rejected."""
+        indices = torch.tensor([[5], [5]])  # Out of bounds for 2x2
+        values = torch.tensor([1.0])
+        shape = (2, 2)
+        with pytest.raises(RuntimeError) as exc_info:  # noqa: SIM117
+            with torch.sparse.check_sparse_tensor_invariants():
+                tensor = torch.sparse_coo_tensor(indices, values, shape)
+                tensor.to_dense()
+        assert (
+            "index" in str(exc_info.value).lower()
+            or "bound" in str(exc_info.value).lower()
+        )
+    def test_negative_indices_rejected(self):
+        """Sparse tensors with negative indices should be rejected."""
+        indices = torch.tensor([[-1], [0]])
+        values = torch.tensor([1.0])
+        shape = (2, 2)
+        with pytest.raises(RuntimeError):  # noqa: SIM117
+            with torch.sparse.check_sparse_tensor_invariants():
+                tensor = torch.sparse_coo_tensor(indices, values, shape)
+                tensor.to_dense()
+    def test_without_context_manager_allows_invalid(self):
+        """
+        WITHOUT validation, invalid tensors may not immediately error.
+        This demonstrates the vulnerability: PyTorch 2.8.0+ doesn't validate
+        by default, which can lead to memory corruption.
+        """
+        indices = torch.tensor([[100], [100]])  # Way out of bounds
+        values = torch.tensor([1.0])
+        shape = (2, 2)
+        # Without validation context, this might create an invalid tensor
+        # (actual behavior depends on PyTorch version)
+        tensor = torch.sparse_coo_tensor(indices, values, shape)
+        # The tensor object is created, but it's invalid
+        assert tensor.is_sparse
+class TestTorchLoadWithValidation:
+    """Test torch.load() with sparse tensor validation."""
+    def test_load_valid_sparse_tensor_with_validation(self):
+        """Valid sparse tensors should load successfully with validation."""
+        # Create and save a valid sparse tensor
+        indices = torch.tensor([[0, 1], [0, 1]])
+        values = torch.tensor([1.0, 2.0])
+        tensor = torch.sparse_coo_tensor(indices, values, (2, 2))
+        buffer = io.BytesIO()
+        torch.save(tensor, buffer)
+        buffer.seek(0)
+        # Load with validation
+        with torch.sparse.check_sparse_tensor_invariants():
+            loaded = torch.load(buffer, weights_only=True)
+            dense = loaded.to_dense()
+        assert dense.shape == (2, 2)
+    def test_load_invalid_sparse_tensor_rejected(self):
+        """Invalid sparse tensors should be caught when loaded with validation."""
+        # Create an invalid sparse tensor (out of bounds)
+        indices = torch.tensor([[10], [10]])
+        values = torch.tensor([1.0])
+        tensor = torch.sparse_coo_tensor(indices, values, (2, 2))
+        buffer = io.BytesIO()
+        torch.save(tensor, buffer)
+        buffer.seek(0)
+        # Load with validation - should fail during load or, at the latest, on to_dense()
+        with pytest.raises(RuntimeError):  # noqa: SIM117
+            with torch.sparse.check_sparse_tensor_invariants():
+                loaded = torch.load(buffer, weights_only=True)
+                loaded.to_dense()
+    def test_load_dense_tensor_unaffected(self):
+        """Dense tensors should work normally with the validation context."""
+        # Create and save a dense tensor
+        tensor = torch.randn(10, 20)
+        buffer = io.BytesIO()
+        torch.save(tensor, buffer)
+        buffer.seek(0)
+        # Load with validation (should have no effect on dense tensors)
+        with torch.sparse.check_sparse_tensor_invariants():
+            loaded = torch.load(buffer, weights_only=True)
+        assert loaded.shape == (10, 20)
+        assert not loaded.is_sparse
+if __name__ == "__main__":
+    # Allow running directly for quick testing
+    pytest.main([__file__, "-v", "--tb=short"])
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
 import base64
 import mimetypes
 import os
@@ -8,6 +9,7 @@ from tempfile import NamedTemporaryFile, TemporaryDirectory
 import numpy as np
 import pytest
+import torch
 from PIL import Image, ImageChops
 from vllm.multimodal.image import convert_image_mode
@@ -186,6 +188,7 @@ async def test_fetch_image_error_conversion():
        connector.fetch_image(broken_img)
+@pytest.mark.flaky(reruns=3, reruns_delay=5)
 @pytest.mark.asyncio
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])
@@ -198,8 +201,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
        }
    )
-    video_sync, metadata_sync = connector.fetch_video(video_url)
+    try:
-    video_async, metadata_async = await connector.fetch_video_async(video_url)
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(video_url)
+    except (TimeoutError, asyncio.TimeoutError) as e:
+        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
    assert np.array_equal(video_sync, video_async)
    assert metadata_sync == metadata_async
@@ -404,6 +411,97 @@ def test_argsort_mm_positions(case):
    assert modality_idxs == expected_modality_idxs
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, 5),
+        (torch.tensor([True, True, True, True, True]), 5),
+        (torch.tensor([False, False, False, False, False]), 0),
+        (torch.tensor([True, False, True, False, True]), 3),
+        (torch.tensor([True]), 1),
+    ],
+)
+def test_placeholder_range_get_num_embeds(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_num_embeds == expected
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, None),
+        (
+            torch.tensor([False, True, False, True, True]),
+            torch.tensor([0, 1, 1, 2, 3]),
+        ),
+        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
+    ],
+)
+def test_placeholder_range_embeds_cumsum(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    if expected is None:
+        assert pr.embeds_cumsum is None
+        return
+    assert torch.equal(pr.embeds_cumsum, expected)
+    # cached_property should return the same object on repeated access
+    assert pr.embeds_cumsum is pr.embeds_cumsum
+@pytest.mark.parametrize(
+    "is_embed,start_idx,end_idx,expected",
+    [
+        (None, 2, 4, (2, 4)),
+        (
+            torch.tensor([False, True, False, True, True]),
+            3,
+            5,
+            (1, 3),
+        ),
+        (
+            torch.tensor([False, True, False, True, True]),
+            0,
+            2,
+            (0, 1),
+        ),
+        (
+            torch.tensor([True, False, True, False]),
+            2,
+            2,
+            (1, 1),
+        ),
+    ],
+)
+def test_placeholder_range_get_embeds_indices_in_range(
+    is_embed, start_idx, end_idx, expected
+):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_embeds_indices_in_range(start_idx, end_idx) == expected
+@pytest.mark.parametrize(
+    "offset,is_embed,expected",
+    [
+        (0, None, [(0, 4)]),
+        (
+            2,
+            torch.tensor([False, True, False, True, True]),
+            [(3, 3), (5, 6)],
+        ),
+        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
+        (0, torch.tensor([False, False, False, False]), []),
+    ],
+)
+def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=offset, length=length, is_embed=is_embed)
+    assert pr.extract_embeds_range() == expected
 @pytest.mark.asyncio
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 @pytest.mark.parametrize("num_frames", [-1, 32, 1800])

--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
    """
    Regression test for handling videos with broken frames.
    This test uses a pre-corrupted video file (assets/corrupted.mp4) that
-    contains broken/unreadable frames to verify the video loader handles
+    contains broken frames to verify the video loader handles
    them gracefully without crashing and returns accurate metadata.
    """
    with monkeypatch.context() as m:
@@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
            f"Expected fewer than {metadata['total_num_frames']} frames, "
            f"but loaded {frames.shape[0]} frames"
        )
+@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
+class TestVideoBackendOverride1(VideoLoader):
+    """Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""
+    @classmethod
+    def load_bytes(
+        cls, data: bytes, num_frames: int = -1, **kwargs
+    ) -> tuple[npt.NDArray, dict]:
+        return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}
+@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
+class TestVideoBackendOverride2(VideoLoader):
+    """Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""
+    @classmethod
+    def load_bytes(
+        cls, data: bytes, num_frames: int = -1, **kwargs
+    ) -> tuple[npt.NDArray, dict]:
+        return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}
+def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
+    environment variable.
+    This allows users to dynamically select a different video backend
+    via --media-io-kwargs without changing the global env var, which is
+    useful when plugins set a default backend but a specific request
+    needs a different one.
+    """
+    with monkeypatch.context() as m:
+        # Set the env var to one backend
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")
+        imageio = ImageMediaIO()
+        # Without video_backend kwarg, should use env var backend
+        videoio_default = VideoMediaIO(imageio, num_frames=10)
+        frames_default, metadata_default = videoio_default.load_bytes(b"test")
+        np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
+        assert metadata_default["video_backend"] == "test_video_backend_override_1"
+        # With video_backend kwarg, should override env var
+        videoio_override = VideoMediaIO(
+            imageio, num_frames=10, video_backend="test_video_backend_override_2"
+        )
+        frames_override, metadata_override = videoio_override.load_bytes(b"test")
+        np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
+        assert metadata_override["video_backend"] == "test_video_backend_override_2"
+def test_video_media_io_backend_kwarg_not_passed_to_loader(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """
+    Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
+    through to the underlying video loader's load_bytes method.
+    This ensures the kwarg is properly popped from kwargs before forwarding.
+    """
+    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
+    class RejectVideoBackendKwargLoader(VideoLoader):
+        """Test loader that fails if video_backend is passed through."""
+        @classmethod
+        def load_bytes(
+            cls, data: bytes, num_frames: int = -1, **kwargs
+        ) -> tuple[npt.NDArray, dict]:
+            # This should never receive video_backend in kwargs
+            if "video_backend" in kwargs:
+                raise AssertionError(
+                    "video_backend should be consumed by VideoMediaIO, "
+                    "not passed to loader"
+                )
+            return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())}
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")
+        imageio = ImageMediaIO()
+        # Even when video_backend is provided, it should NOT be passed to loader
+        videoio = VideoMediaIO(
+            imageio,
+            num_frames=10,
+            video_backend="test_reject_video_backend_kwarg",
+            other_kwarg="should_pass_through",
+        )
+        # This should NOT raise AssertionError
+        frames, metadata = videoio.load_bytes(b"test")
+        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
+        # Verify other kwargs are still passed through
+        assert "other_kwarg" in metadata["received_kwargs"]
+def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that when video_backend kwarg is None or not provided,
+    VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")
+        imageio = ImageMediaIO()
+        # Explicit None should fall back to env var
+        videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None)
+        frames_none, metadata_none = videoio_none.load_bytes(b"test")
+        np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2)
+        assert metadata_none["video_backend"] == "test_video_backend_override_2"
+        # Not providing video_backend should also fall back to env var
+        videoio_missing = VideoMediaIO(imageio, num_frames=10)
+        frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
+        np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
+        assert metadata_missing["video_backend"] == "test_video_backend_override_2"
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -10,9 +10,9 @@ import pytest
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
-if not current_platform.is_device_capability(100):
+if not current_platform.is_device_capability_family(100):
    pytest.skip(
-        "This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
+        "This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
    )

--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
    task = "wikitext"
    rtol = 0.1
-    # Smaller cuda_graph_sizes to speed up the test.
+    # Smaller cudagraph_capture_sizes to speed up the test.
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(
-            tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]}
+            tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]}
        ),
        tasks=task,
        batch_size=64,

--- a/tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+++ b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from transformers import AutoTokenizer
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+parser_name = "minimax_m2_append_think"
+end_token = "</think>"
+# MiniMax M2 model ID passed to AutoTokenizer.from_pretrained
+REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
+@pytest.fixture(scope="module")
+def minimax_m2_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+# =============================================================================
+# MiniMaxM2AppendThinkReasoningParser behavior:
+# - Prepends <think> to the beginning of the output
+# - Does NOT separate reasoning and content
+# - Returns everything as content (with <think> prepended)
+# - reasoning is always None
+#
+# This parser is used when you want to keep the raw output with <think> added
+# =============================================================================
+# Case: simple output with end token
+SIMPLE_OUTPUT = {
+    "output": "This is reasoning</think>This is response",
+    "reasoning": None,
+    "content": "<think>This is reasoning</think>This is response",
+    "is_reasoning_end": True,
+}
+# Case: output without end token (reasoning in progress)
+NO_END_TOKEN = {
+    "output": "This is reasoning in progress",
+    "reasoning": None,
+    "content": "<think>This is reasoning in progress",
+    "is_reasoning_end": False,
+}
+# Case: only end token
+ONLY_END_TOKEN = {
+    "output": "</think>This is response",
+    "reasoning": None,
+    "content": "<think></think>This is response",
+    "is_reasoning_end": True,
+}
+# Case: multiple lines
+MULTIPLE_LINES = {
+    "output": "Line 1\nLine 2</think>Response 1\nResponse 2",
+    "reasoning": None,
+    "content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
+    "is_reasoning_end": True,
+}
+# Case: empty output (non-streaming prepends <think>)
+EMPTY = {
+    "output": "",
+    "reasoning": None,
+    "content": "<think>",
+    "is_reasoning_end": False,
+}
+# Case: empty output streaming (no tokens = no output)
+EMPTY_STREAMING = {
+    "output": "",
+    "reasoning": None,
+    "content": None,
+    "is_reasoning_end": False,
+}
+# Case: special characters
+SPECIAL_CHARS = {
+    "output": "Let me think... 1+1=2</think>Yes!",
+    "reasoning": None,
+    "content": "<think>Let me think... 1+1=2</think>Yes!",
+    "is_reasoning_end": True,
+}
+# Case: code in output
+CODE_OUTPUT = {
+    "output": "```python\nprint('hi')\n```</think>Here's the code.",
+    "reasoning": None,
+    "content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
+    "is_reasoning_end": True,
+}
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_OUTPUT,
+        id="simple_output",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_OUTPUT,
+        id="simple_output_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_END_TOKEN,
+        id="no_end_token",
+    ),
+    pytest.param(
+        True,
+        NO_END_TOKEN,
+        id="no_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        ONLY_END_TOKEN,
+        id="only_end_token",
+    ),
+    pytest.param(
+        True,
+        ONLY_END_TOKEN,
+        id="only_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY,
+        id="empty",
+    ),
+    pytest.param(
+        True,
+        EMPTY_STREAMING,
+        id="empty_streaming",
+    ),
+    pytest.param(
+        False,
+        SPECIAL_CHARS,
+        id="special_chars",
+    ),
+    pytest.param(
+        True,
+        SPECIAL_CHARS,
+        id="special_chars_streaming",
+    ),
+    pytest.param(
+        False,
+        CODE_OUTPUT,
+        id="code_output",
+    ),
+    pytest.param(
+        True,
+        CODE_OUTPUT,
+        id="code_output_streaming",
+    ),
+]
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    minimax_m2_tokenizer,
+):
+    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
+    # convert each token individually to its string form
+    output_tokens: list[str] = [
+        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        minimax_m2_tokenizer
+    )
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
+    # Test is_reasoning_end
+    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
--- a/tests/reasoning/test_minimax_m2_reasoning_parser.py
+++ b/tests/reasoning/test_minimax_m2_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from transformers import AutoTokenizer
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+parser_name = "minimax_m2"
+end_token = "</think>"
+# MiniMax M2 model path
+REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
+@pytest.fixture(scope="module")
+def minimax_m2_tokenizer():
+    """Return the tokenizer for REASONING_MODEL_NAME, created once per test module."""
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+# =============================================================================
+# MiniMax M2 specific behavior:
+# - Model does NOT generate <think> start token
+# - Model only generates </think> end token
+# - All content before </think> is reasoning
+# - All content after </think> is the actual response (content)
+# =============================================================================
+# Case: reasoning + end token + content (typical case)
+SIMPLE_REASONING = {
+    "output": "This is a reasoning section</think>This is the rest",
+    "reasoning": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+# Case: reasoning + end token only (no content after)
+COMPLETE_REASONING = {
+    "output": "This is a reasoning section</think>",
+    "reasoning": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": True,
+}
+# Case: no end token yet (streaming in progress, all is reasoning)
+NO_END_TOKEN = {
+    "output": "This is reasoning in progress",
+    "reasoning": "This is reasoning in progress",
+    "content": None,
+    "is_reasoning_end": False,
+}
+# Case: multiple lines of reasoning
+MULTIPLE_LINES = {
+    "output": "First line\nSecond line</think>Response first line\nResponse second",
+    "reasoning": "First line\nSecond line",
+    "content": "Response first line\nResponse second",
+    "is_reasoning_end": True,
+}
+# Case: only end token (empty reasoning, immediate response)
+SHORTEST_REASONING_NO_STREAMING = {
+    "output": "</think>This is the response",
+    "reasoning": "",
+    "content": "This is the response",
+    "is_reasoning_end": True,
+}
+# Case: only end token streaming (reasoning is None because it's just the token)
+SHORTEST_REASONING_STREAMING = {
+    "output": "</think>This is the response",
+    "reasoning": None,
+    "content": "This is the response",
+    "is_reasoning_end": True,
+}
+# Case: empty output
+EMPTY = {
+    "output": "",
+    "reasoning": "",
+    "content": None,
+    "is_reasoning_end": False,
+}
+# Case: empty streaming
+EMPTY_STREAMING = {
+    "output": "",
+    "reasoning": None,
+    "content": None,
+    "is_reasoning_end": False,
+}
+# Case: reasoning with special characters (punctuation and arithmetic symbols)
+SPECIAL_CHARS = {
+    "output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
+    "reasoning": "Let me think... 1+1=2, right?",
+    "content": "Yes, 1+1=2.",
+    "is_reasoning_end": True,
+}
+# Case: reasoning with code blocks
+CODE_IN_REASONING = {
+    "output": "```python\nprint('hello')\n```</think>Here is the code.",
+    "reasoning": "```python\nprint('hello')\n```",
+    "content": "Here is the code.",
+    "is_reasoning_end": True,
+}
+# Each entry pairs a streaming flag with the expected-results dict for that mode.
+# Most scenarios reuse one dict for both modes; the shortest-reasoning and empty
+# scenarios use separate dicts for the streaming and non-streaming paths.
+TEST_CASES = [
+    # Core cases: no start token (MiniMax M2 actual behavior)
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_reasoning",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        NO_END_TOKEN,
+        id="no_end_token",
+    ),
+    pytest.param(
+        True,
+        NO_END_TOKEN,
+        id="no_end_token_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING,
+        id="shortest_reasoning",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING_STREAMING,
+        id="shortest_reasoning_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY,
+        id="empty",
+    ),
+    pytest.param(
+        True,
+        EMPTY_STREAMING,
+        id="empty_streaming",
+    ),
+    pytest.param(
+        False,
+        SPECIAL_CHARS,
+        id="special_chars",
+    ),
+    pytest.param(
+        True,
+        SPECIAL_CHARS,
+        id="special_chars_streaming",
+    ),
+    pytest.param(
+        False,
+        CODE_IN_REASONING,
+        id="code_in_reasoning",
+    ),
+    pytest.param(
+        True,
+        CODE_IN_REASONING,
+        id="code_in_reasoning_streaming",
+    ),
+]
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    minimax_m2_tokenizer,
+):
+    """Check text extraction, is_reasoning_end and extract_content_ids for a scenario.
+
+    ``param_dict`` supplies the raw model "output" together with the expected
+    "reasoning", "content" and "is_reasoning_end" values; ``streaming`` selects
+    which extraction path run_reasoning_extraction exercises.
+    """
+    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
+    # Decode each token on its own so the parser is fed one text piece per token.
+    output_tokens: list[str] = [
+        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
+        minimax_m2_tokenizer
+    )
+    reasoning, content = run_reasoning_extraction(
+        parser, output_tokens, streaming=streaming
+    )
+    assert reasoning == param_dict["reasoning"]
+    assert content == param_dict["content"]
+    # Test is_reasoning_end
+    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
+    # Test extract_content_ids. Always pass token ids: handing it the token
+    # strings (as the no-content branch used to) can never match the end-token
+    # id, so that branch would pass without testing anything.
+    content_ids = parser.extract_content_ids(output_ids)
+    if param_dict["content"] is not None:
+        expected_ids = minimax_m2_tokenizer.convert_tokens_to_ids(
+            minimax_m2_tokenizer.tokenize(param_dict["content"])
+        )
+    else:
+        expected_ids = []
+    assert content_ids == expected_ids
--- a/tests/reasoning/test_mistral_reasoning_parser.py
+++ b/tests/reasoning/test_mistral_reasoning_parser.py
@@ -5,7 +5,7 @@ import pytest
 from tests.reasoning.utils import run_reasoning_extraction_mistral
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.tokenizers import MistralTokenizer
+from vllm.tokenizers.mistral import MistralTokenizer
 parser_name = "mistral"
@@ -18,47 +18,53 @@ def mistral_tokenizer():
    return mistral_tokenizer
-SIMPLE_REASONING = {
+INVALID_SIMPLE_REASONING = {
    "output": "This is a reasoning section[/THINK]This is the rest",
-    "reasoning": "This is a reasoning section",
+    "reasoning": None,
-    "content": "This is the rest",
+    "content": "This is a reasoning sectionThis is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
-COMPLETE_REASONING = {
+INVALID_COMPLETE_REASONING = {
    "output": "This is a reasoning section[/THINK]",
-    "reasoning": "This is a reasoning section",
+    "reasoning": None,
-    "content": None,
+    "content": "This is a reasoning section",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
 NO_CONTENT = {
-    "output": "This is content",
+    "output": "[THINK]This is reasoning",
-    "reasoning": "This is content",
+    "reasoning": "This is reasoning",
    "content": None,
    "is_reasoning_end": False,
 }
+NO_REASONING = {
+    "output": "This is content",
+    "reasoning": None,
+    "content": "This is content",
+    "is_reasoning_end": False,
+}
 NO_REASONING_STREAMING = {
    "output": "This is a reasoning section",
-    "reasoning": "This is a reasoning section",
+    "reasoning": None,
-    "content": None,
+    "content": "This is a reasoning section",
    "is_reasoning_end": False,
 }
-MULTIPLE_LINES = {
+INVALID_MULTIPLE_LINES = {
    "output": "This\nThat[/THINK]This is the rest\nThat",
-    "reasoning": "This\nThat",
+    "reasoning": None,
-    "content": "This is the rest\nThat",
+    "content": "This\nThatThis is the rest\nThat",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
-SHORTEST_REASONING_NO_STREAMING = {
+INVALID_SHORTEST_REASONING_NO_STREAMING = {
    "output": "[/THINK]This is the rest",
-    "reasoning": "",
+    "reasoning": None,
    "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
-SHORTEST_REASONING = {
+INVALID_SHORTEST_REASONING = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
 REASONING_WITH_THINK = {
    "output": "[THINK]This is a reasoning section[/THINK]This is the rest",
@@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = {
    "content": "This is the rest\nThat",
    "is_reasoning_end": True,
 }
-SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
+INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
    "output": "[/THINK]This is the rest",
-    "reasoning": "",
+    "reasoning": None,
    "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
-SHORTEST_REASONING_WITH_THINK = {
+INVALID_SHORTEST_REASONING_WITH_THINK = {
    "output": "[/THINK]This is the rest",
    "reasoning": None,
    "content": "This is the rest",
-    "is_reasoning_end": True,
+    "is_reasoning_end": False,
 }
 THINK_NO_END = {
    "output": "[THINK]This is a reasoning section",
@@ -98,8 +104,8 @@ THINK_NO_END = {
 }
 EMPTY = {
    "output": "",
-    "reasoning": "",
+    "reasoning": None,
-    "content": None,
+    "content": "",
    "is_reasoning_end": False,
 }
 EMPTY_STREAMING = {
@@ -109,47 +115,48 @@ EMPTY_STREAMING = {
    "is_reasoning_end": False,
 }
 NEW_LINE = {
-    "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
+    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
    "reasoning": "This is a reasoning section",
-    "content": "\nThis is the rest",
+    "content": "Before\n\nThis is the rest",
    "is_reasoning_end": True,
 }
-# Streaming cannot handle new lines at the beginning of the output
-# because we need to support [THINK]...[/THINK] and [/THINK]...
-# We cannot know if the text before [THINK] is reasoning content
-# or not.
 NEW_LINE_STREAMING = {
-    "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
+    "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest",
-    "reasoning": "\nThis is a reasoning section",
+    "reasoning": "This is a reasoning section",
-    "content": "\nThis is the rest",
+    "content": "Before\n\nThis is the rest",
    "is_reasoning_end": True,
 }
 TEST_CASES = [
    pytest.param(
        False,
-        SIMPLE_REASONING,
+        INVALID_SIMPLE_REASONING,
-        id="simple_reasoning",
+        id="invalid_simple_reasoning",
    ),
    pytest.param(
        True,
-        SIMPLE_REASONING,
+        INVALID_SIMPLE_REASONING,
-        id="simple_reasoning_streaming",
+        id="invalid_simple_reasoning_streaming",
    ),
    pytest.param(
        False,
-        COMPLETE_REASONING,
+        INVALID_COMPLETE_REASONING,
-        id="complete_reasoning",
+        id="invalid_complete_reasoning",
    ),
    pytest.param(
        True,
-        COMPLETE_REASONING,
+        INVALID_COMPLETE_REASONING,
-        id="complete_reasoning_streaming",
+        id="invalid_complete_reasoning_streaming",
    ),
    pytest.param(
        False,
        NO_CONTENT,
-        id="no_content_token",
+        id="no_content",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_reasoning",
    ),
    pytest.param(
        True,
@@ -158,23 +165,23 @@ TEST_CASES = [
    ),
    pytest.param(
        False,
-        MULTIPLE_LINES,
+        INVALID_MULTIPLE_LINES,
-        id="multiple_lines",
+        id="invalid_multiple_lines",
    ),
    pytest.param(
        True,
-        MULTIPLE_LINES,
+        INVALID_MULTIPLE_LINES,
-        id="multiple_lines_streaming",
+        id="invalid_multiple_lines_streaming",
    ),
    pytest.param(
        True,
-        SHORTEST_REASONING,
+        INVALID_SHORTEST_REASONING,
-        id="shortest",
+        id="invalid_shortest",
    ),
    pytest.param(
        False,
-        SHORTEST_REASONING_NO_STREAMING,
+        INVALID_SHORTEST_REASONING_NO_STREAMING,
-        id="shortest_streaming",
+        id="invalid_shortest_streaming",
    ),
    pytest.param(
        False,
@@ -208,13 +215,13 @@ TEST_CASES = [
    ),
    pytest.param(
        False,
-        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
+        INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
-        id="shortest_with_think",
+        id="invalid_shortest_with_think",
    ),
    pytest.param(
        True,
-        SHORTEST_REASONING_WITH_THINK,
+        INVALID_SHORTEST_REASONING_WITH_THINK,
-        id="shortest_with_think_streaming",
+        id="invalid_shortest_with_think_streaming",
    ),
    pytest.param(
        False,
@@ -316,10 +323,26 @@ def test_mistral_reasoning(
    # Test extract_content
    if param_dict["content"] is not None:
-        content = parser.extract_content_ids(output_tokens)
+        # Handle the case where there are tokens outputted before Thinking.
-        assert content == mistral_tokenizer.tokenizer.encode(
+        # This should not occur if the model is well trained and prompted.
-            param_dict["content"], bos=False, eos=False
+        if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith(
+            "[THINK]"
+        ):
+            before_content = param_dict["output"].split("[THINK]")[0]
+            before_token_ids = mistral_tokenizer.tokenizer.encode(
+                before_content, bos=False, eos=False
+            )
+            left_to_encode = param_dict["content"][len(before_content) :]
+        # Normal situation.
+        else:
+            before_token_ids = []
+            left_to_encode = param_dict["content"]
+        content_tokens = parser.extract_content_ids(output_tokens)
+        expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode(
+            left_to_encode, bos=False, eos=False
        )
+        assert content_tokens == expected_token_ids
    else:
        content = parser.extract_content_ids(output_tokens)
        assert content == []
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -4,7 +4,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.reasoning import ReasoningParser
-from vllm.tokenizers import MistralTokenizer
+from vllm.tokenizers.mistral import MistralTokenizer
 class StreamingReasoningReconstructor:

--- a/tests/standalone_tests/python_only_compile.sh
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -3,12 +3,45 @@
 # for users who do not have any compilers installed on their system
 set -e
-set -x
 merge_base_commit=$(git merge-base HEAD origin/main)
-echo "Current merge base commit with main: $merge_base_commit"
+echo "INFO: current merge base commit with main: $merge_base_commit"
 git show --oneline -s $merge_base_commit
+# test whether the metadata.json url is valid, retrying every 3 minutes up to 5 times
+# this avoids cumbersome error messages & manual retries in case the precompiled wheel
+# for the given commit is still being built in the release pipeline
+meta_json_url="https://wheels.vllm.ai/$merge_base_commit/vllm/metadata.json"
+echo "INFO: will use metadata.json from $meta_json_url"
+for i in {1..5}; do
+    echo "Checking metadata.json URL (attempt $i)..."
+    if curl --fail "$meta_json_url" > metadata.json; then
+        echo "INFO: metadata.json URL is valid."
+        # check whether it is valid json by python
+        if python3 -m json.tool metadata.json; then
+            echo "INFO: metadata.json is valid JSON. Proceeding with the test."
+        else
+            echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!"
+            exit 1
+        fi
+        break
+    fi
+    # failure handling
+    if [ $i -eq 5 ]; then
+        echo "ERROR: metadata.json URL is still not valid after 5 attempts."
+        echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists."
+        echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes."
+        echo " NOTE: If it fails, please report in #sig-ci channel."
+        exit 1
+    else
+        echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..."
+        sleep 180
+    fi
+done
+set -x
 cd /vllm-workspace/
 # uninstall vllm
@@ -29,6 +62,6 @@ python3 -c 'import vllm'
 # Check if the clangd log file was created
 if [ ! -f /tmp/changed.file ]; then
-    echo "changed.file was not created, python only compilation failed"
+    echo "ERROR: changed.file was not created, python only compilation failed"
    exit 1
 fi
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -89,64 +89,6 @@ def test_update_config():
        new_config3 = update_config(config3, {"a": "new_value"})
-# Can remove once --task option is fully deprecated
-@pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
-    [
-        ("distilbert/distilgpt2", "generate", "none", "generate"),
-        ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
-        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
-        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"),
-        ("openai/whisper-small", "generate", "none", "transcription"),
-    ],
-)
-def test_auto_task(
-    model_id, expected_runner_type, expected_convert_type, expected_task
-):
-    config = ModelConfig(model_id, task="auto")
-    assert config.runner_type == expected_runner_type
-    assert config.convert_type == expected_convert_type
-# Can remove once --task option is fully deprecated
-@pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
-    [
-        ("distilbert/distilgpt2", "pooling", "embed", "embed"),
-        ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
-        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"),
-        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
-        ("openai/whisper-small", "pooling", "embed", "embed"),
-    ],
-)
-def test_score_task(
-    model_id, expected_runner_type, expected_convert_type, expected_task
-):
-    config = ModelConfig(model_id, task="score")
-    assert config.runner_type == expected_runner_type
-    assert config.convert_type == expected_convert_type
-# Can remove once --task option is fully deprecated
-@pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
-    [
-        ("openai/whisper-small", "generate", "none", "transcription"),
-    ],
-)
-def test_transcription_task(
-    model_id, expected_runner_type, expected_convert_type, expected_task
-):
-    config = ModelConfig(model_id, task="transcription")
-    assert config.runner_type == expected_runner_type
-    assert config.convert_type == expected_convert_type
 @pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_convert_type"),
    [
@@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
    )
    # Override one field but not others
-    pass_config = PassConfig(enable_noop=False)
+    pass_config = PassConfig(eliminate_noops=False)
    compilation_config = CompilationConfig(pass_config=pass_config)
    config = VllmConfig(
        model_config=regular_model,

--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -8,6 +8,7 @@ import pytest
 import vllm.envs as envs
 from vllm.envs import (
+    disable_envs_cache,
    enable_envs_cache,
    env_list_with_choices,
    env_set_with_choices,
@@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
    envs.__getattr__ = envs.__getattr__.__wrapped__
+def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Check that disable_envs_cache() makes envs follow os.environ again.
+
+    VLLM_HOST_IP is changed while the cache is enabled (the old value must
+    still be returned) and again after the cache is disabled (the new values
+    must be returned).
+    """
+    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
+    # __getattr__ is not decorated with functools.cache
+    assert not hasattr(envs.__getattr__, "cache_info")
+    # Enable envs cache and ignore ongoing environment changes
+    enable_envs_cache()
+    assert envs.VLLM_HOST_IP == "1.1.1.1"
+    # With cache enabled, the environment variable value is cached and unchanged
+    monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
+    assert envs.VLLM_HOST_IP == "1.1.1.1"
+    # Disabling the cache exposes the value that was set while it was enabled
+    disable_envs_cache()
+    assert envs.VLLM_HOST_IP == "2.2.2.2"
+    # After cache disabled, the environment variable value would be synced
+    # with os.environ
+    monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
+    assert envs.VLLM_HOST_IP == "3.3.3.3"
+def test_is_envs_cache_enabled() -> None:
+    """Check that _is_envs_cache_enabled() tracks enable/disable calls.
+
+    Repeated enable_envs_cache() calls must be undone by a single
+    disable_envs_cache(), and disabling an already-disabled cache must leave
+    it disabled.
+    """
+    assert not envs._is_envs_cache_enabled()
+    enable_envs_cache()
+    assert envs._is_envs_cache_enabled()
+    # Only wrap one-layer of cache, so we only need to
+    # call disable once to reset.
+    enable_envs_cache()
+    enable_envs_cache()
+    enable_envs_cache()
+    disable_envs_cache()
+    assert not envs._is_envs_cache_enabled()
+    # A second disable on an already-disabled cache keeps it disabled
+    disable_envs_cache()
+    assert not envs._is_envs_cache_enabled()
 class TestEnvWithChoices:
    """Test cases for env_with_choices function."""

--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -7,7 +7,7 @@ from vllm.config import ModelConfig
 from vllm.inputs import zip_enc_dec_prompts
 from vllm.inputs.parse import parse_raw_prompts
 from vllm.inputs.preprocess import InputPreprocessor
-from vllm.tokenizers import init_tokenizer_from_config
+from vllm.tokenizers import cached_tokenizer_from_config
 pytestmark = pytest.mark.cpu_test
@@ -34,6 +34,13 @@ INPUTS_SLICES = [
 ]
+# Test that a nested mixed-type list of lists raises a TypeError.
+@pytest.mark.parametrize("invalid_input", [[[1, 2], ["foo", "bar"]]])
+def test_invalid_input_raise_type_error(invalid_input):
+    """Expect TypeError when a list of ints is batched with a list of strings."""
+    with pytest.raises(TypeError):
+        parse_raw_prompts(invalid_input)
 def test_parse_raw_single_batch_empty():
    with pytest.raises(ValueError, match="at least one prompt"):
        parse_raw_prompts([])
@@ -108,7 +115,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
    model_config = ModelConfig(model=model_id)
-    tokenizer = init_tokenizer_from_config(model_config)
+    tokenizer = cached_tokenizer_from_config(model_config)
    input_preprocessor = InputPreprocessor(model_config, tokenizer)
    # HF processor adds sep token

--- a/tests/tokenizers_/test_basic.py
+++ b/tests/tokenizers_/test_basic.py
@@ -3,38 +3,39 @@
 from typing import _get_protocol_attrs  # type: ignore
 import pytest
-from transformers import PreTrainedTokenizerBase
+from transformers import (
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
 from vllm.tokenizers import TokenizerLike, get_tokenizer
+from vllm.tokenizers.mistral import MistralTokenizer
 def _get_missing_attrs(obj: object, target: type):
    return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)]
+def _assert_tokenizer_like(tokenizer: object):
+    """Assert that ``tokenizer`` has every attribute declared by TokenizerLike.
+
+    On failure the assertion message lists the missing attribute names.
+    """
+    missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike)
+    assert not missing_attrs, f"Missing attrs: {missing_attrs}"
 def test_tokenizer_like_protocol():
-    assert not (
+    tokenizer = get_tokenizer("gpt2", use_fast=False)
-        missing_attrs := _get_missing_attrs(
+    assert isinstance(tokenizer, PreTrainedTokenizer)
-            get_tokenizer("gpt2", use_fast=False),
+    _assert_tokenizer_like(tokenizer)
-            TokenizerLike,
-        )
+    tokenizer = get_tokenizer("gpt2", use_fast=True)
-    ), f"Missing attrs: {missing_attrs}"
+    assert isinstance(tokenizer, PreTrainedTokenizerFast)
+    _assert_tokenizer_like(tokenizer)
-    assert not (
-        missing_attrs := _get_missing_attrs(
+    tokenizer = get_tokenizer(
-            get_tokenizer("gpt2", use_fast=True),
+        "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
-            TokenizerLike,
+    )
-        )
+    assert isinstance(tokenizer, MistralTokenizer)
-    ), f"Missing attrs: {missing_attrs}"
+    _assert_tokenizer_like(tokenizer)
-    assert not (
-        missing_attrs := _get_missing_attrs(
-            get_tokenizer(
-                "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral"
-            ),
-            TokenizerLike,
-        )
-    ), f"Missing attrs: {missing_attrs}"
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])