Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -12,8 +12,6 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "openai/whisper-large-v3-turbo"
@@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
 ]


-@pytest.fixture
-def mary_had_lamb():
-    path = AudioAsset('mary_had_lamb').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture
-def winning_call():
-    path = AudioAsset('winning_call').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
 @pytest.fixture(scope="module")
 def server():
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
        assert out_usage["seconds"] == 16, out_usage["seconds"]


+@pytest.mark.asyncio
+async def test_basic_audio_gemma(foscolo):
+    # Gemma accuracy on some of the audio samples we use is particularly bad,
+    # hence we use a different one here. WER is evaluated separately.
+    model_name = "google/gemma-3n-E2B-it"
+    server_args = ["--enforce-eager"]
+
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=foscolo,
+            language="it",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "da cui vergine nacque Venere" in out
+
+
 @pytest.mark.asyncio
 async def test_non_asr_model(winning_call):
    # text to text model

--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -12,32 +12,24 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer

-MODEL_NAME = "openai/whisper-small"
 SERVER_ARGS = ["--enforce-eager"]


-@pytest.fixture
-def foscolo():
-    # Test translation it->en
-    path = AudioAsset('azacinto_foscolo').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture(scope="module")
-def server():
-    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
-        yield remote_server
+@pytest.fixture(scope="module",
+                params=["openai/whisper-small", "google/gemma-3n-E2B-it"])
+def server(request):
+    # Parametrize over model name
+    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+        yield remote_server, request.param


 @pytest_asyncio.fixture
-async def client(server):
+async def client_and_model(server):
+    server, model_name = server
    async with server.get_async_client() as async_client:
-        yield async_client
+        yield async_client, model_name


 @pytest.mark.asyncio
@@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo):

 # NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
 @pytest.mark.asyncio
-async def test_basic_audio(foscolo, client):
+async def test_basic_audio(foscolo, client_and_model):
+    client, model_name = client_and_model
    translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        response_format="text",
-        # TODO remove once language detection is implemented
-        extra_body=dict(language="it"),
+        # TODO remove `language="it"` once language detection is implemented
+        extra_body=dict(language="it", to_language="en"),
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()
    assert "greek sea" in out


 @pytest.mark.asyncio
-async def test_audio_prompt(foscolo, client):
+async def test_audio_prompt(foscolo, client_and_model):
+    client, model_name = client_and_model
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    transcription = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        prompt=prompt,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(transcription)['text']
@@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client):


 @pytest.mark.asyncio
-async def test_streaming_response(foscolo, client, server):
+async def test_streaming_response(foscolo, client_and_model, server):
+    client, model_name = client_and_model
    translation = ""
    res_no_stream = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=foscolo,
        response_format="json",
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en", seed=42),
        temperature=0.0)
+
    # Stream via HTTPX since OpenAI translation client doesn't expose streaming
+    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
-        "model": MODEL_NAME,
+        "model": model_name,
        "language": "it",
+        "to_language": "en",
        "stream": True,
        "temperature": 0.0,
+        "seed": 42,
    }
    foscolo.seek(0)
    async with httpx.AsyncClient() as http_client:
@@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server):
                text = chunk["choices"][0].get("delta", {}).get("content")
                translation += text or ""

-    assert translation == res_no_stream.text
+    res_stream = translation.split()
+    # NOTE: There's a small non-determinism issue here, likely in the attention
+    # computation, which causes a few tokens to differ between the streamed and
+    # non-streamed outputs while remaining semantically very close.
+    assert sum([
+        x == y for x, y in zip(res_stream, res_no_stream.text.split())
+    ]) >= len(res_stream) * 0.9


 @pytest.mark.asyncio
-async def test_stream_options(foscolo, client, server):
+async def test_stream_options(foscolo, server):
+    server, model_name = server
    url = server.url_for("v1/audio/translations")
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    data = {
-        "model": MODEL_NAME,
+        "model": model_name,
        "language": "it",
+        "to_language": "en",
        "stream": True,
        "stream_include_usage": True,
        "stream_continuous_usage_stats": True,
@@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server):


 @pytest.mark.asyncio
-async def test_long_audio_request(foscolo, client):
+async def test_long_audio_request(foscolo, client_and_model):
+    client, model_name = client_and_model
+    if model_name == "google/gemma-3n-E2B-it":
+        pytest.skip("Gemma3n does not support long audio requests")
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)
    repeated_audio = np.tile(audio, 2)
@@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client):
    sf.write(buffer, repeated_audio, sr, format='WAV')
    buffer.seek(0)
    translation = await client.audio.translations.create(
-        model=MODEL_NAME,
+        model=model_name,
        file=buffer,
-        extra_body=dict(language="it"),
+        extra_body=dict(language="it", to_language="en"),
        response_format="text",
        temperature=0.0)
    out = json.loads(translation)['text'].strip().lower()

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
 MAXIMUM_IMAGES = 2

 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]

 EXPECTED_MM_BEAM_SEARCH_RES = [
@@ -69,10 +69,11 @@ async def client(server):


 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_asset:
+        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        for image_asset in TEST_IMAGE_ASSETS
    }


@@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    content_text = "What's in this image?"
@@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,
                                               model_name: str,
                                               image_url: str):
@@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
                                                    model_name: str,
                                                    image_url: str):
@@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_single_chat_session_image_base64encoded(
-        client: openai.AsyncOpenAI, model_name: str, image_url: str,
-        base64_encoded_image: dict[str, str]):
+        client: openai.AsyncOpenAI, model_name: str, raw_image_url: str,
+        image_url: str, base64_encoded_image: dict[str, str]):

    content_text = "What's in this image?"
    messages = [{
@@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded(
                "type": "image_url",
                "image_url": {
                    "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
                }
            },
            {
@@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded(

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
+@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
 async def test_single_chat_session_image_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, image_idx: int,
        base64_encoded_image: dict[str, str]):
    # NOTE: This test also validates that we pass MM data through beam search
-    image_url = TEST_IMAGE_URLS[image_idx]
+    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]

    messages = [{
@@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
                "type": "image_url",
                "image_url": {
                    "url":
-                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                    f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
                }
            },
            {
@@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(

 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_chat_streaming_image(client: openai.AsyncOpenAI,
                                    model_name: str, image_url: str):
    messages = [{
@@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize(
    "image_urls",
-    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):

@@ -433,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                            }
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image_with_uuid(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                            },
+                            "uuid": image_url
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
+    indirect=True)
+async def test_completions_with_image_with_incorrect_uuid_format(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    image_urls: list[str],
+):
+    for image_url in image_urls:
+        chat_completion = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url,
+                                "incorrect_uuid_key": image_url,
+                            },
+                            "also_incorrect_uuid_key": image_url,
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion.choices[0].message.content is not None
+        assert isinstance(chat_completion.choices[0].message.content, str)
+        assert len(chat_completion.choices[0].message.content) > 0
--- a/tests/entrypoints/pooling/__init__.py
+++ b/tests/entrypoints/pooling/__init__.py
--- a/tests/entrypoints/pooling/correctness/__init__.py
+++ b/tests/entrypoints/pooling/correctness/__init__.py
--- a/tests/entrypoints/openai/correctness/test_mteb_embed.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py
@@ -4,10 +4,9 @@ import os

 import pytest

-from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
-                                                      MTEB_EMBED_TOL,
-                                                      OpenAIClientMtebEncoder,
-                                                      run_mteb_embed_task)
+from tests.models.language.pooling_mteb_test.mteb_utils import (
+    MTEB_EMBED_TASKS, MTEB_EMBED_TOL, OpenAIClientMtebEncoder,
+    run_mteb_embed_task)
 from tests.utils import RemoteOpenAIServer

 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
@@ -37,4 +36,6 @@ def test_mteb_embed(server):
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
+    # It is fine for the vLLM MTEB score to be better than the
+    # SentenceTransformers score, so we only perform a one-sided test.
+    assert st_main_score - vllm_main_score < MTEB_EMBED_TOL
--- a/tests/entrypoints/openai/correctness/test_mteb_score.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -4,18 +4,15 @@ import os

 import pytest

-# yapf conflicts with isort for this block
-# yapf: disable
-from tests.models.language.pooling.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_utils import (
    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
-    RerankClientMtebEncoder, ScoreClientMtebEncoder,
-    mteb_test_rerank_models_hf, run_mteb_rerank)
-# yapf: enable
+    RerankClientMtebEncoder, ScoreClientMtebEncoder, run_mteb_rerank)
 from tests.utils import RemoteOpenAIServer

 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+st_main_score = 0.33457


 @pytest.fixture(scope="module")
@@ -29,15 +26,7 @@ def server():
        yield remote_server


-@pytest.fixture(scope="module")
-def st_main_score(hf_runner):
-    # The main score related to the version of the dependency.
-    # So we need to recalculate every time.
-    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
-    return main_score
-
-
-def test_mteb_score(server, st_main_score):
+def test_mteb_score(server):
    url = server.url_for("score")
    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -47,10 +36,12 @@ def test_mteb_score(server, st_main_score):
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # It is fine for the vLLM MTEB score to be better than the
+    # SentenceTransformers score, so we only perform a one-sided test.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL


-def test_mteb_rerank(server, st_main_score):
+def test_mteb_rerank(server):
    url = server.url_for("rerank")
    encoder = RerankClientMtebEncoder(MODEL_NAME, url)
    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
@@ -60,4 +51,6 @@ def test_mteb_rerank(server, st_main_score):
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    # It is fine for the vLLM MTEB score to be better than the
+    # SentenceTransformers score, so we only perform a one-sided test.
+    assert st_main_score - vllm_main_score < MTEB_RERANK_TOL
--- a/tests/entrypoints/pooling/llm/__init__.py
+++ b/tests/entrypoints/pooling/llm/__init__.py
--- a/tests/entrypoints/llm/test_classify.py
+++ b/tests/entrypoints/llm/test_classify.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch

+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory

-from ...models.utils import softmax
-
 MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"

 prompts = ["The chef prepared a delicious meal."]

--- a/tests/entrypoints/llm/test_embedding.py
+++ b/tests/entrypoints/llm/test_embedding.py
--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
--- a/tests/entrypoints/llm/test_reward.py
+++ b/tests/entrypoints/llm/test_reward.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch

+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory

-from ...models.utils import softmax
-
 MODEL_NAME = "internlm/internlm2-1_8b-reward"

 prompts = ["The chef prepared a delicious meal."]

--- a/tests/entrypoints/llm/test_score.py
+++ b/tests/entrypoints/llm/test_score.py
@@ -6,11 +6,10 @@ import weakref
 import pytest
 import torch

+from tests.models.utils import softmax
 from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory

-from ...models.utils import softmax
-
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"



--- a/tests/entrypoints/pooling/openai/__init__.py
+++ b/tests/entrypoints/pooling/openai/__init__.py
--- a/tests/entrypoints/openai/test_classification.py
+++ b/tests/entrypoints/openai/test_classification.py
@@ -6,10 +6,9 @@ import requests
 import torch
 import torch.nn.functional as F

+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import ClassificationResponse

-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
 DTYPE = "float32"  # Use float32 to avoid NaN issue


--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -11,14 +11,13 @@ import requests
 import torch
 import torch.nn.functional as F

+from tests.models.language.pooling.embed_utils import (
+    run_embedding_correctness_test)
+from tests.models.utils import check_embeddings_close
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...models.language.pooling.embed_utils import (
-    run_embedding_correctness_test)
-from ...models.utils import check_embeddings_close
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"

--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -9,13 +9,12 @@ from typing import Optional
 import openai
 import pytest

-from vllm.entrypoints.openai.protocol import EmbeddingResponse
-
-from ...conftest import HfRunner
-from ...models.language.pooling.embed_utils import (
+from tests.conftest import HfRunner
+from tests.models.language.pooling.embed_utils import (
    run_embedding_correctness_test)
-from ...models.utils import EmbedModelInfo
-from ...utils import RemoteOpenAIServer
+from tests.models.utils import EmbedModelInfo
+from tests.utils import RemoteOpenAIServer
+from vllm.entrypoints.openai.protocol import EmbeddingResponse

 MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),

--- a/tests/entrypoints/openai/test_embedding_long_text.py
+++ b/tests/entrypoints/openai/test_embedding_long_text.py
@@ -14,10 +14,9 @@ import openai
 import pytest
 import pytest_asyncio

+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import EmbeddingResponse

-from ...utils import RemoteOpenAIServer
-

 def _generate_random_text(word_count: int) -> str:
    """Generate random text with approximately the specified word count."""

--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -8,11 +8,10 @@ import pytest
 import requests

 from tests.models.utils import check_embeddings_close
+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import PoolingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer

-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "internlm/internlm2-1_8b-reward"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501


--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -6,10 +6,9 @@ import requests
 import torch
 import torch.nn.functional as F

+from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import RerankResponse

-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"