Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
    decode_pooling_output,
 )

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"


+if current_platform.is_rocm():
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+
+
 @pytest.fixture(scope="module")
 def server():
    args = [
@@ -53,6 +57,10 @@ def server():
        DUMMY_CHAT_TEMPLATE,
    ]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


--- a/tests/entrypoints/pooling/embed/test_online_dimensions.py
+++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py
@@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
    EmbedModelInfo(
@@ -62,6 +57,10 @@ def server(model_info, dtype: str):
            ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
        )

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(model_info.name, args) as remote_server:
        yield remote_server


--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-

 def _generate_random_text(word_count: int) -> str:
    """Generate random text with approximately the specified word count."""
@@ -221,13 +216,17 @@ def server_with_chunked_processing():
        "512",  # Set smaller max_model_len to trigger chunking mechanism
        "--pooler-config",
        (
-            '{"pooling_type": "MEAN", "normalize": true, '
+            '{"pooling_type": "MEAN", "use_activation": true, '
            '"enable_chunked_processing": true, "max_embed_len": 10000}'
        ),
        "--gpu-memory-utilization",
        "0.8",
    ]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@@ -11,7 +11,7 @@ from transformers import AutoProcessor
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import fetch_image

 from ...utils import models_path_prefix, urls_port

@@ -55,14 +55,6 @@ def server():
        yield remote_server


-@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
-    return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
-        for image_url in TEST_IMAGE_ASSETS
-    }
-
-
 def get_hf_prompt_tokens(model_name, content, image_url):
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4

--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@@ -4,7 +4,7 @@ import os

 import pytest

-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
    MTEB_RERANK_LANGS,
    MTEB_RERANK_TASKS,
    MTEB_RERANK_TOL,
@@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -30,6 +25,10 @@ st_main_score = 0.33457
 def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


--- a/tests/entrypoints/pooling/score/test_offline.py
+++ b/tests/entrypoints/pooling/score/test_offline.py
@@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


 @pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@@ -30,6 +31,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
+        attention_config=attention_config,
    )

    yield weakref.proxy(llm)

--- a/tests/entrypoints/pooling/score/test_online_rerank.py
+++ b/tests/entrypoints/pooling/score/test_online_rerank.py
@@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
 from vllm.entrypoints.pooling.score.protocol import RerankResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"

@@ -24,6 +19,10 @@ DTYPE = "bfloat16"
 def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


--- a/tests/entrypoints/pooling/score/test_online_score.py
+++ b/tests/entrypoints/pooling/score/test_online_score.py
@@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.score.protocol import ScoreResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
    {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
    {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@@ -44,6 +39,10 @@ def model(request):
 def server(model: dict[str, Any]):
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server

@@ -237,17 +236,14 @@ class TestModel:
                    "use_activation": use_activation,
                },
            )
-            if response.status_code != 200:
-                return response
-
            outputs = response.json()
            return torch.tensor([x["score"] for x in outputs["data"]])

-        if model["is_cross_encoder"]:
-            default = get_outputs(use_activation=None)
-            w_activation = get_outputs(use_activation=True)
-            wo_activation = get_outputs(use_activation=False)
+        default = get_outputs(use_activation=None)
+        w_activation = get_outputs(use_activation=True)
+        wo_activation = get_outputs(use_activation=False)

+        if model["is_cross_encoder"]:
            assert torch.allclose(default, w_activation, atol=1e-2), (
                "Default should use activation."
            )
@@ -257,9 +253,3 @@ class TestModel:
            assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
                "w_activation should be close to activation(wo_activation)."
            )
-        else:
-            get_outputs(use_activation=None)
-
-            # The activation parameter only works for the is_cross_encoder model
-            response = get_outputs(use_activation=True)
-            assert response.status_code == 400
--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import patch
+
+import pytest
+
+from vllm.config import ModelConfig
+from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
+from vllm.entrypoints.score_utils import get_score_prompt
+from vllm.inputs import TokensPrompt
+from vllm.tokenizers import get_tokenizer
+
+# A cross-encoder model for testing
+CROSS_ENCODER_MODEL_ID = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+
+
+def assert_prompt_tokenization_consistent(
+    tokenizer, full_prompt, engine_prompt, add_special_tokens=True
+):
+    """Check that re-tokenizing ``full_prompt`` reproduces the engine prompt.
+
+    Args:
+        tokenizer: Callable returning a mapping with an ``"input_ids"`` entry.
+        full_prompt: Prompt text to tokenize.
+        engine_prompt: Mapping holding the ``"prompt_token_ids"`` to compare.
+        add_special_tokens: Forwarded unchanged to the tokenizer call.
+
+    Raises:
+        AssertionError: If the two token ID sequences differ.
+    """
+    encoding = tokenizer(full_prompt, add_special_tokens=add_special_tokens)
+    expected_ids = encoding["input_ids"]
+    actual_ids = engine_prompt["prompt_token_ids"]
+    assert actual_ids == expected_ids, (
+        f"Token IDs don't match.\nExpected: {expected_ids}\nActual:   {actual_ids}"
+    )
+
+
+@pytest.fixture(scope="module")
+def cross_encoder_model_config():
+    """Build the pooling-runner ``ModelConfig`` for the cross-encoder model."""
+    model_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
+    return model_config
+
+
+@pytest.fixture(scope="module")
+def cross_encoder_tokenizer(cross_encoder_model_config):
+    """Load the cross-encoder tokenizer with the config's trust_remote_code."""
+    trust_remote_code = cross_encoder_model_config.trust_remote_code
+    return get_tokenizer(CROSS_ENCODER_MODEL_ID, trust_remote_code=trust_remote_code)
+
+
+@pytest.fixture(scope="module")
+def llm_reranker_model_config():
+    """Model config for LLM-as-reranker style (separator token disabled)."""
+    reranker_config = ModelConfig(CROSS_ENCODER_MODEL_ID, runner="pooling")
+    # ``use_sep_token`` is a property that reads from ``hf_config``, so the
+    # override of its default (True) has to be written there.
+    reranker_config.hf_config.use_sep_token = False
+    return reranker_config
+
+
+@pytest.fixture
+def tokenization_kwargs():
+    """Provide the baseline tokenizer keyword arguments shared by the tests."""
+    shared_kwargs = dict(add_special_tokens=True, return_tensors=None)
+    return shared_kwargs
+
+
+@pytest.fixture
+def mock_model_with_score_template():
+    """Return a stand-in model class that advertises score-template support.
+
+    The class records every prompt handed to ``post_process_tokens`` in
+    ``post_process_called`` so tests can inspect the calls afterwards.
+    """
+
+    class MockModelWithScoreTemplate:
+        supports_score_template = True
+        # Prompts received by post_process_tokens, in call order.
+        post_process_called: list[TokensPrompt] = []
+
+        @staticmethod
+        def post_process_tokens(prompt: TokensPrompt) -> None:
+            MockModelWithScoreTemplate.post_process_called.append(prompt)
+
+        @staticmethod
+        def get_score_template(p1: str, p2: str) -> str:
+            return f"[QUERY]{p1}[SEP][DOC]{p2}"
+
+    return MockModelWithScoreTemplate
+
+
+@pytest.fixture
+def mock_model_no_score_template():
+    """Return a stand-in model class whose ``supports_score_template`` is False."""
+
+    class MockModelNoScoreTemplate:
+        supports_score_template = False
+
+    model_cls = MockModelNoScoreTemplate
+    return model_cls
+
+
+class TestGetScorePrompt:
+    """Tests for the get_score_prompt function.
+
+    Each test calls get_score_prompt with a model config, the cross-encoder
+    tokenizer, tokenization kwargs and a query/document pair, then checks the
+    returned ``(full_prompt, engine_prompt)`` tuple. Model-class lookup and
+    chat-template application are patched where a test needs a specific path.
+    """
+
+    def test_tokenization_kwargs_passed_through(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+    ):
+        """Test that tokenization kwargs are properly passed through."""
+        data_1 = "Query text"
+        data_2 = "Document text"
+
+        # Test with truncation - custom kwargs for this test
+        custom_tokenization_kwargs = {
+            "add_special_tokens": True,
+            "return_tensors": None,
+            "truncation": True,
+            "max_length": 20,
+        }
+
+        full_prompt, engine_prompt = get_score_prompt(
+            llm_reranker_model_config,
+            cross_encoder_tokenizer,
+            custom_tokenization_kwargs,
+            data_1,
+            data_2,
+        )
+
+        assert isinstance(full_prompt, str)
+        assert "prompt_token_ids" in engine_prompt
+        # With max_length=20 and truncation, should not exceed this
+        assert len(engine_prompt["prompt_token_ids"]) <= 20
+        # Since truncation was applied, token_ids should be a prefix of full encoding
+        full_ids = cross_encoder_tokenizer(full_prompt, add_special_tokens=True)[
+            "input_ids"
+        ]
+        actual_ids = engine_prompt["prompt_token_ids"]
+        assert full_ids[: len(actual_ids)] == actual_ids, (
+            f"Token IDs are not a prefix of full encoding.\n"
+            f"Full IDs:   {full_ids}\n"
+            f"Actual IDs: {actual_ids}"
+        )
+
+    def test_model_supports_score_template(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test when model supports score template (no score_template arg)."""
+        with patch(
+            "vllm.model_executor.model_loader.get_model_cls",
+            return_value=mock_model_with_score_template,
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query text",
+                "document text",
+            )
+
+        # The expected string is exactly what the mock's get_score_template builds.
+        assert full_prompt == "[QUERY]query text[SEP][DOC]document text"
+        assert "prompt_token_ids" in engine_prompt
+        assert len(engine_prompt["prompt_token_ids"]) > 0
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_model_supports_score_template_but_custom_template_provided(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test when model supports score template but custom template is provided."""
+        template = (
+            'TEMPLATE_USED {{ messages[0]["content"] }} {{ messages[1]["content"] }}'
+        )
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_with_score_template,
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "doc",
+                score_template=template,  # Providing a template
+            )
+
+        assert "prompt_token_ids" in engine_prompt
+        # The explicit template wins over the mock's "[QUERY]...[SEP][DOC]..." form.
+        assert full_prompt == "TEMPLATE_USED query doc"
+
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_not_using_default_template(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        """Test the prompt produced when no score template is passed explicitly."""
+        # FIXME: For now, we only apply a template when one is explicitly provided.
+        # We cannot rely on the tokenizer's chat template because many models
+        # inherit junk templates from their base LLM, which breaks both the models
+        # and the tests that use them.
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                return_value="test querytest doc",
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                llm_reranker_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "test query",
+                "test doc",
+            )
+
+        assert full_prompt == "test querytest doc"
+        assert "prompt_token_ids" in engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_fallback_with_sep_token(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        """Test the fallback path taken when ChatTemplateResolutionError is
+        raised and use_sep_token=True."""
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,  # use_sep_token=True
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "document",
+            )
+
+        assert "prompt_token_ids" in engine_prompt
+        # Should have token_type_ids from text_pair encoding
+        assert "token_type_ids" in engine_prompt
+        assert "query" in full_prompt
+        assert "document" in full_prompt
+        # Plain concatenation would mean the pair encoding was not used.
+        assert full_prompt != "querydocument"
+        assert (
+            engine_prompt["prompt_token_ids"]
+            == cross_encoder_tokenizer(
+                "query", text_pair="document", add_special_tokens=True
+            )["input_ids"]
+        )
+
+        # FIXME(?): add_special_tokens=False is needed because in this case
+        # full_prompt is obtained by decoding the tokenized prompt, which includes
+        # special tokens and we would get duplicated special tokens otherwise.
+        # This is inconsistent with other cases.
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer,
+            full_prompt,
+            engine_prompt,
+            add_special_tokens=False,
+        )
+
+    def test_fallback_without_sep_token(
+        self,
+        llm_reranker_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_no_score_template,
+    ):
+        """Test the fallback path taken when ChatTemplateResolutionError is
+        raised and use_sep_token=False."""
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_no_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                llm_reranker_model_config,  # use_sep_token=False
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "document",
+            )
+
+        assert full_prompt == "querydocument"
+        assert "prompt_token_ids" in engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
+
+    def test_post_process_tokens_called(
+        self,
+        cross_encoder_model_config,
+        cross_encoder_tokenizer,
+        tokenization_kwargs,
+        mock_model_with_score_template,
+    ):
+        """Test that post_process_tokens is called on the engine prompt."""
+        # Reset the call tracker
+        mock_model_with_score_template.post_process_called.clear()
+
+        with (
+            patch(
+                "vllm.model_executor.model_loader.get_model_cls",
+                return_value=mock_model_with_score_template,
+            ),
+            patch(
+                "vllm.entrypoints.score_utils.apply_hf_chat_template",
+                side_effect=ChatTemplateResolutionError("No template"),
+            ),
+        ):
+            full_prompt, engine_prompt = get_score_prompt(
+                cross_encoder_model_config,
+                cross_encoder_tokenizer,
+                tokenization_kwargs,
+                "query",
+                "doc",
+            )
+
+        # post_process_tokens should have been called once
+        assert len(mock_model_with_score_template.post_process_called) == 1
+        # Identity check: the hook must receive the very object that is returned.
+        assert mock_model_with_score_template.post_process_called[0] is engine_prompt
+        assert_prompt_tokenization_consistent(
+            cross_encoder_tokenizer, full_prompt, engine_prompt
+        )
--- a/tests/v1/tpu/worker/__init__.py
+++ b/tests/v1/tpu/worker/__init__.py
--- a/tests/entrypoints/openai/test_collective_rpc.py
+++ b/tests/entrypoints/openai/test_collective_rpc.py
@@ -37,7 +37,7 @@ def server():
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
-        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
+        "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,

--- a/vllm/attention/backends/__init__.py
+++ b/vllm/attention/backends/__init__.py
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -5,7 +5,7 @@ import os
 import requests
 from prometheus_client.parser import text_string_to_metric_families

-from ...utils import RemoteOpenAIServer, models_path_prefix
+from tests.utils import RemoteOpenAIServer, models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")


--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -26,9 +26,9 @@ from vllm.entrypoints.chat_utils import (
 )
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
-    encode_audio_base64,
-    encode_image_base64,
-    encode_video_base64,
+    encode_audio_url,
+    encode_image_url,
+    encode_video_url,
 )
 from vllm.tokenizers import get_tokenizer
 from vllm.tokenizers.mistral import MistralTokenizer
@@ -142,22 +142,19 @@ def mistral_model_config():
 @pytest.fixture(scope="module")
 def image_url():
    image = ImageAsset("cherry_blossom")
-    base64 = encode_image_base64(image.pil_image)
-    return f"data:image/jpeg;base64,{base64}"
+    return encode_image_url(image.pil_image)


 @pytest.fixture(scope="module")
 def video_url():
    video = VideoAsset("baby_reading", 1)
-    base64 = encode_video_base64(video.np_ndarrays)
-    return f"data:video/jpeg;base64,{base64}"
+    return encode_video_url(video.np_ndarrays)


 @pytest.fixture(scope="module")
 def audio_url():
    audio = AudioAsset("mary_had_lamb")
-    base64 = encode_audio_base64(*audio.audio_and_sample_rate)
-    return f"data:audio/ogg;base64,{base64}"
+    return encode_audio_url(*audio.audio_and_sample_rate)


 def _assert_mm_data_is_image_input(

--- a/tests/entrypoints/test_grpc_server.py
+++ b/tests/entrypoints/test_grpc_server.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+End-to-end tests for the vLLM gRPC server.
+"""
+
+import asyncio
+import socket
+import subprocess
+import sys
+import time
+
+import grpc
+import pytest
+import pytest_asyncio
+
+from vllm.grpc import vllm_engine_pb2, vllm_engine_pb2_grpc
+
+# Use a small model for fast testing
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
+
+
+def find_free_port() -> int:
+    """Ask the OS for an unused TCP port and return its number.
+
+    The probing socket is closed before returning, so the port is only known
+    to have been free at the time of the call.
+    """
+    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    with probe:
+        probe.bind(("", 0))  # port 0 lets the OS choose
+        probe.listen(1)
+        _host, port = probe.getsockname()
+    return port
+
+
+async def _check_health_once(port: int) -> bool:
+    """Run a single HealthCheck RPC and return the reported ``healthy`` flag."""
+    channel = grpc.aio.insecure_channel(f"localhost:{port}")
+    try:
+        stub = vllm_engine_pb2_grpc.VllmEngineStub(channel)
+        request = vllm_engine_pb2.HealthCheckRequest()
+        response = await stub.HealthCheck(request, timeout=5.0)
+        return bool(response.healthy)
+    finally:
+        # Close the channel on every path so failed attempts do not leak it.
+        await channel.close()
+
+
+async def wait_for_server(port: int, timeout: float = 60.0) -> bool:
+    """Wait for the gRPC server to be ready by trying health checks.
+
+    Args:
+        port: Port on localhost the server is expected to listen on.
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True once a health check answers with ``healthy`` set, False if the
+        timeout elapses first.
+    """
+    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
+    deadline = time.monotonic() + timeout
+    print("waiting for server to start...")
+    while time.monotonic() < deadline:
+        try:
+            healthy = await _check_health_once(port)
+        except Exception:
+            # Best-effort polling: any failure (typically the server not
+            # accepting connections yet) just means "try again".
+            healthy = False
+        if healthy:
+            print("server returned healthy=True")
+            return True
+        # Back off between attempts, including when the server answered but
+        # reported unhealthy, so the loop never spins.
+        await asyncio.sleep(0.5)
+    return False
+
+
+class GrpcServerProcess:
+    """Owns a vLLM gRPC server launched as a child process."""
+
+    def __init__(self):
+        # Child process handle; stays None until start() launches the server.
+        self.process: subprocess.Popen | None = None
+        # Port the server listens on; stays None until start() picks one.
+        self.port: int | None = None
+
+    async def start(self):
+        """Launch the server on a free port and wait until it reports healthy.
+
+        Raises:
+            RuntimeError: If the server does not become healthy in time.
+        """
+        self.port = find_free_port()
+
+        command = [
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.grpc_server",
+            "--model",
+            MODEL_NAME,
+            "--host",
+            "localhost",
+            "--port",
+            str(self.port),
+            "--max-num-batched-tokens",
+            "512",
+            "--disable-log-stats-server",
+        ]
+        self.process = subprocess.Popen(command)
+
+        server_ready = await wait_for_server(self.port)
+        if server_ready:
+            return
+
+        # Do not leave the child running when startup failed.
+        self.stop()
+        raise RuntimeError("gRPC server failed to start within timeout")
+
+    def stop(self):
+        """Terminate the child process, escalating to kill() after 10 seconds."""
+        if not self.process:
+            return
+
+        self.process.terminate()
+        try:
+            self.process.wait(timeout=10)
+        except subprocess.TimeoutExpired:
+            self.process.kill()
+            self.process.wait()
+
+
+@pytest_asyncio.fixture(scope="module")
+async def grpc_server():
+    """Start one gRPC server subprocess per module and stop it at teardown."""
+    server_process = GrpcServerProcess()
+    await server_process.start()
+
+    yield server_process
+
+    server_process.stop()
+
+
+@pytest_asyncio.fixture
+async def grpc_client(grpc_server):
+    """Yield a stub connected to the running server; close the channel after."""
+    target = f"localhost:{grpc_server.port}"
+    channel = grpc.aio.insecure_channel(target)
+
+    yield vllm_engine_pb2_grpc.VllmEngineStub(channel)
+
+    await channel.close()
+
+
+@pytest.mark.asyncio
+async def test_health_check(grpc_client):
+    """HealthCheck should report a healthy server with the expected message."""
+    response = await grpc_client.HealthCheck(vllm_engine_pb2.HealthCheckRequest())
+
+    assert response.healthy is True
+    assert response.message == "Health"
+
+
+@pytest.mark.asyncio
+async def test_get_model_info(grpc_client):
+    """GetModelInfo should describe the text-generation model under test."""
+    model_info = await grpc_client.GetModelInfo(vllm_engine_pb2.GetModelInfoRequest())
+
+    assert model_info.model_path == MODEL_NAME
+    assert model_info.is_generation is True
+    assert model_info.max_context_length > 0
+    assert model_info.vocab_size > 0
+    assert model_info.supports_vision is False
+
+
+@pytest.mark.asyncio
+async def test_get_server_info(grpc_client):
+    """GetServerInfo should return sane counters and the server type tag."""
+    server_info = await grpc_client.GetServerInfo(
+        vllm_engine_pb2.GetServerInfoRequest()
+    )
+
+    assert server_info.active_requests >= 0
+    assert server_info.is_paused is False
+    assert server_info.uptime_seconds >= 0
+    assert server_info.server_type == "vllm-grpc"
+    assert server_info.last_receive_timestamp > 0
+
+
+@pytest.mark.asyncio
+async def test_generate_non_streaming(grpc_client):
+    """Generate with stream=False should yield a single ``complete`` message."""
+    tokenized_input = vllm_engine_pb2.TokenizedInput(
+        original_text="Hello, my name is",
+        input_ids=[15496, 11, 616, 1438, 318],  # GPT-2 tokens for the prompt
+    )
+    sampling = vllm_engine_pb2.SamplingParams(
+        temperature=0.0,
+        max_tokens=10,
+        n=1,
+    )
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-non-streaming-1",
+        tokenized=tokenized_input,
+        sampling_params=sampling,
+        stream=False,
+    )
+
+    # Drain the response stream before asserting on it.
+    responses = [response async for response in grpc_client.Generate(request)]
+
+    # Non-streaming mode delivers exactly one (complete) response.
+    assert len(responses) == 1
+    final_response = responses[0]
+    assert final_response.HasField("complete")
+
+    complete = final_response.complete
+    assert len(complete.output_ids) > 0
+    assert complete.finish_reason in ["stop", "length"]
+    assert complete.prompt_tokens > 0
+    assert complete.completion_tokens > 0
+
+
+@pytest.mark.asyncio
+async def test_generate_streaming(grpc_client):
+    """Generate with stream=True should end with a ``complete`` message."""
+    tokenized_input = vllm_engine_pb2.TokenizedInput(
+        original_text="The capital of France is",
+        input_ids=[464, 3139, 286, 4881, 318],  # GPT-2 tokens
+    )
+    sampling = vllm_engine_pb2.SamplingParams(temperature=0.0, max_tokens=10, n=1)
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-streaming-1",
+        tokenized=tokenized_input,
+        sampling_params=sampling,
+        stream=True,
+    )
+
+    # Sort the stream into intermediate chunks and the final completion.
+    chunks = []
+    complete_response = None
+    async for message in grpc_client.Generate(request):
+        if message.HasField("chunk"):
+            chunks.append(message.chunk)
+        elif message.HasField("complete"):
+            complete_response = message.complete
+
+    # Zero chunks is acceptable: generation may finish very quickly.
+    assert len(chunks) >= 0
+
+    # A final complete response is mandatory.
+    assert complete_response is not None
+    assert complete_response.finish_reason in ["stop", "length"]
+    assert complete_response.prompt_tokens > 0
+
+    # Every chunk that did arrive must carry sane token counts.
+    for chunk in chunks:
+        assert chunk.prompt_tokens > 0
+        assert chunk.completion_tokens >= 0
+
+
+@pytest.mark.asyncio
+async def test_generate_with_different_sampling_params(grpc_client):
+    """Test non-streaming Generate with temperature/top_p and with top_k."""
+    # Case 1: temperature combined with top_p
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-sampling-temp",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.8, top_p=0.95, max_tokens=5
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+    # Case 2: top_k at temperature 1.0
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-sampling-topk",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=1.0, top_k=50, max_tokens=5
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+
+@pytest.mark.asyncio
+async def test_generate_with_stop_strings(grpc_client):
+    """Test that a non-streaming Generate with stop strings completes."""
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-stop-strings",
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0,
+            max_tokens=20,
+            stop=["\n", "END"],
+        ),
+        stream=False,
+    )
+
+    responses = [r async for r in grpc_client.Generate(request)]
+    assert len(responses) == 1
+    assert responses[0].HasField("complete")
+
+    complete = responses[0].complete
+    assert complete.finish_reason in ["stop", "length"]
+
+
+@pytest.mark.asyncio
+async def test_generate_multiple_requests(grpc_client):
+    """Test handling multiple concurrent Generate requests."""
+
+    async def make_request(request_id: str):
+        request = vllm_engine_pb2.GenerateRequest(
+            request_id=request_id,
+            tokenized=vllm_engine_pb2.TokenizedInput(
+                original_text="Hello",
+                input_ids=[15496],
+            ),
+            sampling_params=vllm_engine_pb2.SamplingParams(
+                temperature=0.0, max_tokens=5
+            ),
+            stream=False,
+        )
+        # Unpacking enforces exactly one message in non-streaming mode
+        (response,) = [r async for r in grpc_client.Generate(request)]
+        return response
+
+    # Send multiple requests concurrently
+    tasks = [make_request(f"test-concurrent-{i}") for i in range(3)]
+    responses = await asyncio.gather(*tasks)
+
+    # Verify all requests completed successfully
+    assert len(responses) == 3
+    for response in responses:
+        assert response.HasField("complete")
+
+
+@pytest.mark.asyncio
+async def test_generate_with_seed(grpc_client):
+    """Test that Generate output is reproducible when the seed is fixed."""
+
+    def make_request(request_id: str, seed: int):
+        return vllm_engine_pb2.GenerateRequest(
+            request_id=request_id,
+            tokenized=vllm_engine_pb2.TokenizedInput(
+                original_text="The future of AI is",
+                input_ids=[464, 2003, 286, 9552, 318],
+            ),
+            sampling_params=vllm_engine_pb2.SamplingParams(
+                temperature=1.0, max_tokens=10, seed=seed
+            ),
+            stream=False,
+        )
+
+    # Build two requests that differ only in request_id (same prompt and seed)
+    request1 = make_request("test-seed-1", 42)
+    request2 = make_request("test-seed-2", 42)
+
+    response_list1 = [r async for r in grpc_client.Generate(request1)]
+    response_list2 = [r async for r in grpc_client.Generate(request2)]
+
+    # Both should complete successfully
+    assert len(response_list1) == 1
+    assert len(response_list2) == 1
+    assert response_list1[0].HasField("complete")
+    assert response_list2[0].HasField("complete")
+
+    # With the same seed, outputs should be identical
+    output_ids1 = list(response_list1[0].complete.output_ids)
+    output_ids2 = list(response_list2[0].complete.output_ids)
+    assert output_ids1 == output_ids2
+
+
+@pytest.mark.asyncio
+async def test_generate_error_handling(grpc_client):
+    """Test that an invalid sampling parameter is rejected by Generate."""
+    # Invalid top_p value (-33); note that no tokenized input is set either
+    request = vllm_engine_pb2.GenerateRequest(
+        request_id="test-error-invalid-topp",
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0, max_tokens=10, top_p=-33
+        ),
+        stream=False,
+    )
+
+    # Consuming the response stream should raise grpc.RpcError
+    with pytest.raises(grpc.RpcError) as exc_info:
+        _ = [r async for r in grpc_client.Generate(request)]
+
+    assert exc_info.value.code() == grpc.StatusCode.INVALID_ARGUMENT
+    assert "top_p must be in (0, 1], got -33.0" in exc_info.value.details()
+
+
+@pytest.mark.asyncio
+async def test_abort_request(grpc_client):
+    """Test the out-of-band Abort RPC."""
+    request_id = "test-abort-1"
+
+    # Build a long-running streaming generate request (sent by run_generate)
+    generate_request = vllm_engine_pb2.GenerateRequest(
+        request_id=request_id,
+        tokenized=vllm_engine_pb2.TokenizedInput(
+            original_text="Hello",
+            input_ids=[15496],
+        ),
+        sampling_params=vllm_engine_pb2.SamplingParams(
+            temperature=0.0,
+            min_tokens=500,
+            max_tokens=500,  # Request many tokens to ensure it runs long enough
+        ),
+        stream=True,
+    )
+
+    # was_aborted mirrors only the most recently received message
+    was_aborted = False
+    received_chunks = 0
+
+    async def run_generate():
+        nonlocal was_aborted, received_chunks
+        async for response in grpc_client.Generate(generate_request):
+            if response.HasField("chunk"):
+                received_chunks += 1
+
+            if response.HasField("complete"):
+                complete = response.complete
+                was_aborted = complete.finish_reason == "abort"
+            else:
+                was_aborted = False
+
+    async def abort_after_delay():
+        # Small delay to ensure generate has started
+        await asyncio.sleep(0.1)
+        abort_request = vllm_engine_pb2.AbortRequest(request_ids=[request_id])
+        await grpc_client.Abort(abort_request)
+
+    # Run generate and abort concurrently
+    await asyncio.gather(run_generate(), abort_after_delay())
+
+    # The last message received must be a "complete" with the "abort" finish
+    # reason, and fewer than 500 chunks must have been received.
+    assert was_aborted and received_chunks < 500, (
+        "Request should have been aborted before generating all 500 tokens"
+    )
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+from openai.types.chat import ChatCompletionMessageParam
 from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
 from openai.types.responses.response_function_tool_call_output_item import (
    ResponseFunctionToolCallOutputItem,
@@ -14,8 +15,10 @@ from openai.types.responses.response_reasoning_item import (
    Summary,
 )

+from vllm.entrypoints.constants import MCP_PREFIX
 from vllm.entrypoints.responses_utils import (
    _construct_single_message_from_response_item,
+    _maybe_combine_reasoning_and_tool_call,
    construct_chat_messages_with_tool_call,
    convert_tool_responses_to_completions_format,
 )
@@ -160,3 +163,118 @@ class TestResponsesUtils:
        formatted_item = _construct_single_message_from_response_item(output_item)
        assert formatted_item["role"] == "assistant"
        assert formatted_item["content"] == "dongyi"
+
+
+class TestMaybeCombineReasoningAndToolCall:
+    """Tests for the _maybe_combine_reasoning_and_tool_call helper."""
+
+    def test_returns_none_when_item_id_is_none(self):
+        """
+        Test fix from PR #31999: when item.id is None, should return None
+        instead of raising TypeError on startswith().
+        """
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=None,  # This was causing TypeError before the fix
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages: list[ChatCompletionMessageParam] = []
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_id_does_not_start_with_mcp_prefix(self):
+        """Test that a tool call whose id lacks MCP_PREFIX is not combined."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id="regular_id",  # Does not start with MCP_PREFIX
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_last_message_is_not_assistant(self):
+        """Test that an MCP tool call after a non-assistant message gives None."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "user", "content": "hello"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_last_message_has_no_reasoning(self):
+        """Test that an assistant message lacking a reasoning key gives None."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "content": "some content"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_combines_reasoning_and_mcp_tool_call(self):
+        """Test successful combination of reasoning message and MCP tool call."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id=f"{MCP_PREFIX}tool_id",
+            call_id="call_123",
+            name="test_function",
+            arguments='{"arg": "value"}',
+        )
+        messages = [{"role": "assistant", "reasoning": "I need to call this tool"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is not None
+        assert result["role"] == "assistant"
+        assert result["reasoning"] == "I need to call this tool"
+        assert "tool_calls" in result
+        assert len(result["tool_calls"]) == 1
+        assert result["tool_calls"][0]["id"] == "call_123"
+        assert result["tool_calls"][0]["function"]["name"] == "test_function"
+        assert result["tool_calls"][0]["function"]["arguments"] == '{"arg": "value"}'
+        assert result["tool_calls"][0]["type"] == "function"
+
+    def test_returns_none_for_non_function_tool_call_type(self):
+        """Test that items that are not ResponseFunctionToolCall return None."""
+        # Pass a plain dict instead of a ResponseFunctionToolCall instance
+        item = {"type": "message", "content": "hello"}
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
+
+    def test_returns_none_when_id_is_empty_string(self):
+        """Test that empty string id returns None (falsy check)."""
+        item = ResponseFunctionToolCall(
+            type="function_call",
+            id="",  # Empty string is falsy
+            call_id="call_123",
+            name="test_function",
+            arguments="{}",
+        )
+        messages = [{"role": "assistant", "reasoning": "some reasoning"}]
+
+        result = _maybe_combine_reasoning_and_tool_call(item, messages)
+
+        assert result is None
--- a/tests/entrypoints/test_utils.py
+++ b/tests/entrypoints/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.entrypoints.utils import sanitize_message
+
+
+def test_sanitize_message():
+    """Check that the memory address is dropped from an object repr."""
+    raw = "<_io.BytesIO object at 0x7a95e299e750>"
+    expected = "<_io.BytesIO object>"
+    assert sanitize_message(raw) == expected
--- a/tests/evals/gsm8k/README.md
+++ b/tests/evals/gsm8k/README.md
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
 ### Run tests with pytest (like buildkite)

 ```bash
-pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
-    --config-list-file=configs/models-small.txt \
-    --tp-size=1
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt
 ```

 ### Run standalone evaluation script
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
 accuracy_threshold: 0.54  # Minimum expected accuracy
 num_questions: 1319       # Number of questions (default: full test set)
 num_fewshot: 5            # Few-shot examples from train set
-max_model_len: 4096       # Model context length
+server_args: "--max-model-len 4096 --tensor-parallel-size 2"  # Server arguments
+env:                      # Environment variables (optional)
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
 ```
+
+The `server_args` field accepts any arguments that can be passed to `vllm serve`.
+
+The `env` field accepts a dictionary of environment variables to set for the server process.
--- a/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
+++ b/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --data-parallel-size 8
+  --enable-expert-parallel
+  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'
--- a/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
+++ b/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml
+model_name: "deepseek-ai/DeepSeek-R1"
+accuracy_threshold: 0.95
+num_questions: 1319
+num_fewshot: 5
+startup_max_wait_seconds: 1200
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 8
+  --enable-expert-parallel
+  --speculative-config '{"method":"mtp","num_speculative_tokens":1}'