Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -7,11 +7,10 @@ import openai
 import pytest
 import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
 from vllm.assets.audio import AudioAsset
 from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
-from ...utils import RemoteOpenAIServer
 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 TEST_AUDIO_URLS = [
    AudioAsset("winning_call").url,

--- a/tests/entrypoints/openai/test_audio_in_video.py
+++ b/tests/entrypoints/openai/test_audio_in_video.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
 import json
 import openai
+import pybase64 as base64
 import pytest
 import pytest_asyncio
-from ...conftest import VideoTestAssets
+from tests.conftest import VideoTestAssets
-from ...utils import RemoteOpenAIServer
+from tests.utils import ROCM_EXTRA_ARGS, RemoteOpenAIServer
 MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
@@ -22,6 +22,7 @@ def server():
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": 3, "video": 3}),
+        *ROCM_EXTRA_ARGS,
    ]
    with RemoteOpenAIServer(

--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
--- a/tests/entrypoints/openai/chat_completion/test_chat_error.py
+++ b/tests/entrypoints/openai/chat_completion/test_chat_error.py
@@ -111,7 +111,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
            [{"prompt_token_ids": [1, 2, 3]}],
        )
-    serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
+    serving_chat.openai_serving_render.preprocess_chat = AsyncMock(
        side_effect=_fake_preprocess_chat
    )
    return serving_chat

--- a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
@@ -231,13 +231,14 @@ def k2_server():
        "--gpu-memory-utilization",
        "0.4",
    ] + ROCM_EXTRA_ARGS
-    # hack to test kimi_k2 tool use tool_id format.
+    # Test kimi_k2 tool use tool_id format by overriding model_type.
-    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
+    # is_deepseek_mla safely returns False via getattr when kv_lora_rank
+    # is absent from the underlying config.
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
        env_dict=ROCM_ENV_OVERRIDES,
-        override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
+        override_hf_configs={"model_type": "kimi_k2"},
    ) as remote_server:
        yield remote_server

--- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
+++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@@ -8,8 +8,8 @@ import pytest
 import pytest_asyncio
 from huggingface_hub import snapshot_download
-from ...conftest import AudioTestAssets
+from tests.conftest import AudioTestAssets
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 # NOTE - the tests in this module are currently analogous to test_chat, but are
 # separated to avoid OOM killing due to module-scoped servers, since we

--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()

--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
@@ -8,7 +8,7 @@ from typing import Any, NamedTuple
 import openai  # use the official client for correctness check
 import pytest
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 # # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

--- a/tests/entrypoints/openai/chat_completion/test_serving_chat.py
+++ b/tests/entrypoints/openai/chat_completion/test_serving_chat.py
@@ -484,7 +484,7 @@ class TestGPTOSSSpeculativeChat:
        )
        content = ""
-        reasoning_content = ""
+        reasoning = ""
        async for chunk in stream:
            delta = chunk.choices[0].delta
            if delta.content:
@@ -492,9 +492,9 @@ class TestGPTOSSSpeculativeChat:
            chunk_reasoning = getattr(delta, "reasoning", None)
            if chunk_reasoning:
-                reasoning_content += delta.reasoning
+                reasoning += delta.reasoning
-        assert len(reasoning_content) > 0, "No reasoning was generated."
+        assert len(reasoning) > 0, "No reasoning was generated."
        assert content.strip() == "4"

--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -7,11 +7,10 @@ import openai
 import pytest
 import pytest_asyncio
+from tests.utils import RemoteOpenAIServer
 from vllm.multimodal.utils import encode_video_url, fetch_video
 from vllm.platforms import current_platform
-from ...utils import RemoteOpenAIServer
 MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 MAXIMUM_VIDEOS = 3

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -8,12 +8,11 @@ import pytest
 import pytest_asyncio
 from transformers import AutoProcessor
+from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 from vllm.multimodal.media import MediaWithBytes
 from vllm.multimodal.utils import encode_image_url, fetch_image
 from vllm.platforms import current_platform
-from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
 MAXIMUM_IMAGES = 2

--- a/tests/entrypoints/openai/test_vision_embeds.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
 import numpy as np
+import pybase64 as base64
 import pytest
 import requests
 import torch
+from tests.utils import RemoteOpenAIServer
 from vllm.utils.serial_utils import tensor2base64
-from ...utils import RemoteOpenAIServer
 @pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]

--- a/tests/entrypoints/instrumentator/__init__.py
+++ b/tests/entrypoints/instrumentator/__init__.py
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -26,19 +26,12 @@ def default_server_args():
        "128",
        "--enforce-eager",
        "--enable-prompt-tokens-details",
+        "--no-enable-prefix-caching",
    ]
-@pytest.fixture(
+@pytest.fixture(scope="module")
-    scope="module",
+def server(default_server_args):
-    params=[
-        ["--no-enable-prefix-caching"],
-        ["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
-    ],
-)
-def server(default_server_args, request):
-    if request.param:
-        default_server_args = default_server_args + request.param
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
 import io
 import json
 import openai  # use the official client for correctness check
+import pybase64 as base64
 import pytest
 import pytest_asyncio
 import torch
@@ -14,7 +14,7 @@ import torch
 from openai import BadRequestError
 from transformers import AutoConfig
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
    return [_encode_embeds(item) for item in example_embeddings]
-@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
+@pytest.fixture(scope="module")
-def server_with_prompt_embeds(default_server_args, request):
+def server_with_prompt_embeds(default_server_args):
-    if request.param:
-        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -11,11 +11,10 @@ import pytest
 import regex as re
 import torch
+from tests.utils import RemoteOpenAIServer
 from vllm.config import ModelConfig
 from vllm.renderers.embed_utils import safe_load_prompt_embeds
-from ...utils import RemoteOpenAIServer
 @pytest.mark.asyncio
 async def test_empty_prompt():

--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
            "0.05",
            "--max-num-seqs",
            "2",
-            "--disable-frontend-multiprocessing",
        ],
        # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
        # stdout/stderr pipes are enabled during ROCm GPU initialization.