Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · 99324e25 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -28,7 +28,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
 class ParallelSetup(NamedTuple):
    tp_size: int
    pp_size: int
-    sp_enabled: bool
+    enable_fusion: bool
    eager_mode: bool
    chunked_prefill: bool
@@ -67,49 +67,18 @@ class SPTestSettings:
        task: TaskOption = "auto",
        load_format: Optional[str] = None,
    ):
+        parallel_setups = []
+        for eager_mode_val in [False, True]:
+            for pp_multiplier in [1, 2]:
+                for chunked_prefill_val in [False, True]:
+                    parallel_setups.append(
+                        ParallelSetup(tp_size=tp_base,
+                                      pp_size=pp_multiplier * pp_base,
+                                      enable_fusion=False,
+                                      eager_mode=eager_mode_val,
+                                      chunked_prefill=chunked_prefill_val))
        return SPTestSettings(
-            parallel_setups=[
+            parallel_setups=parallel_setups,
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              sp_enabled=True,
-                              eager_mode=False,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              sp_enabled=True,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              sp_enabled=True,
-                              eager_mode=True,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              sp_enabled=True,
-                              eager_mode=True,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              sp_enabled=True,
-                              eager_mode=False,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              sp_enabled=True,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              sp_enabled=True,
-                              eager_mode=True,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              sp_enabled=True,
-                              eager_mode=True,
-                              chunked_prefill=True)
-            ],
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
@@ -126,19 +95,44 @@ class SPTestSettings:
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
+        parallel_setups = []
+        for eager_mode_val in [False, True]:
+            for pp_multiplier in [1, 2]:
+                for chunked_prefill_val in [False, True]:
+                    parallel_setups.append(
+                        ParallelSetup(tp_size=tp_base,
+                                      pp_size=pp_multiplier * pp_base,
+                                      enable_fusion=False,
+                                      eager_mode=eager_mode_val,
+                                      chunked_prefill=chunked_prefill_val))
        return SPTestSettings(
-            parallel_setups=[
+            parallel_setups=parallel_setups,
+            distributed_backends=["mp", "ray"],
+            vllm_major_versions=["1", "1"],
+            task=task,
+            test_options=SPTestOptions(multi_node_only=multi_node_only,
+                                       load_format=load_format),
+        )
+    @staticmethod
+    def fp8_quant(
+        *,
+        tp_base: int = 2,
+        pp_base: int = 1,
+        task: TaskOption = "auto",
+        multi_node_only: bool = False,
+        load_format: Optional[str] = None,
+    ):
+        parallel_setups = []
+        for fusion_val in [False, True]:
+            parallel_setups.append(
                ParallelSetup(tp_size=tp_base,
                              pp_size=pp_base,
-                              sp_enabled=True,
+                              enable_fusion=fusion_val,
-                              eager_mode=False,
+                              eager_mode=True,
-                              chunked_prefill=False),
+                              chunked_prefill=False))
-                ParallelSetup(tp_size=tp_base,
+        return SPTestSettings(
-                              pp_size=2 * pp_base,
+            parallel_setups=parallel_setups,
-                              sp_enabled=True,
-                              eager_mode=False,
-                              chunked_prefill=False),
-            ],
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            task=task,
@@ -171,7 +165,7 @@ def _compare_sp(
    (
        tp_size,
        pp_size,
-        sp_enabled,
+        enable_fusion,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup
@@ -240,9 +234,9 @@ def _compare_sp(
        'compile_sizes': [4, 8],
        'splitting_ops': [],
        'pass_config': {
-            'enable_sequence_parallelism': sp_enabled,
+            'enable_sequence_parallelism': True,
+            'enable_fusion': enable_fusion,
            'enable_noop': True,
-            'enable_fusion': True,
        },
    }
@@ -291,12 +285,14 @@ def _compare_sp(
 SP_TEXT_GENERATION_MODELS = {
    # [Decoder-only]
    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 SP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "meta-llama/Llama-3.2-1B-Instruct",
+    "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
 ]

--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -231,6 +231,38 @@ def test_limit_mm_per_prompt_parser(arg, expected):
    assert args.limit_mm_per_prompt == expected
+@pytest.mark.parametrize(
+    ("arg", "expected"),
+    [
+        (None, dict()),
+        ('{"video": {"num_frames": 123} }', {
+            "video": {
+                "num_frames": 123
+            }
+        }),
+        (
+            '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }',  # noqa
+            {
+                "video": {
+                    "num_frames": 123,
+                    "fps": 1.0,
+                    "foo": "bar"
+                },
+                "image": {
+                    "foo": "bar"
+                }
+            }),
+    ])
+def test_media_io_kwargs_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--media-io-kwargs", arg])
+    assert args.media_io_kwargs == expected
 def test_compilation_config():
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
@@ -239,32 +271,40 @@ def test_compilation_config():
    assert args.compilation_config == CompilationConfig()
    # set to O0
-    args = parser.parse_args(["-O3"])
+    args = parser.parse_args(["-O0"])
-    assert args.compilation_config.level == 3
+    assert args.compilation_config.level == 0
    # set to O 1 (space)
-    args = parser.parse_args(["-O", "3"])
+    args = parser.parse_args(["-O", "1"])
-    assert args.compilation_config.level == 3
+    assert args.compilation_config.level == 1
    # set to O 2 (equals)
-    args = parser.parse_args(["-O=3"])
+    args = parser.parse_args(["-O=2"])
+    assert args.compilation_config.level == 2
+    # set to O.level 3
+    args = parser.parse_args(["-O.level", "3"])
    assert args.compilation_config.level == 3
    # set to string form of a dict
    args = parser.parse_args([
-        "--compilation-config",
+        "-O",
-        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}',
+        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+        '"use_inductor": false}',
    ])
    assert (args.compilation_config.level == 3 and
-            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8])
+            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
+            and not args.compilation_config.use_inductor)
    # set to string form of a dict
    args = parser.parse_args([
        "--compilation-config="
-        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}',
+        '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+        '"use_inductor": true}',
    ])
    assert (args.compilation_config.level == 3 and
-            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8])
+            args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
+            and args.compilation_config.use_inductor)
 def test_prefix_cache_default():

--- a/tests/engine/test_options.py
+++ b/tests/engine/test_options.py
@@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str,
    ctx = (nullcontext() if enable_prompt_embeds else pytest.raises(
        ValueError, match="set `--enable-prompt-embeds`"))
-    # This test checks if the flag skip_tokenizer_init skips the initialization
-    # of tokenizer and detokenizer. The generated output is expected to contain
-    # token ids.
    llm = LLM(
        model=model,
        enable_prompt_embeds=enable_prompt_embeds,

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -8,6 +8,8 @@ import pytest
 from vllm import LLM, PoolingParams, PoolingRequestOutput
 from vllm.distributed import cleanup_dist_env_and_memory
+from ...models.utils import check_embeddings_close
 MODEL_NAME = "intfloat/multilingual-e5-small"
 PROMPTS = [
@@ -27,6 +29,14 @@ TOKEN_IDS = [
 ]
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 @pytest.fixture(scope="module")
 def llm():
    # pytest caches the fixture so we use weakref.proxy to
@@ -46,9 +56,15 @@ def llm():
    cleanup_dist_env_and_memory()
-def assert_outputs_equal(o1: list[PoolingRequestOutput],
+def assert_outputs_match(o1: list[PoolingRequestOutput],
                         o2: list[PoolingRequestOutput]):
-    assert [o.outputs for o in o1] == [o.outputs for o in o2]
+    check_embeddings_close(
+        embeddings_0_lst=[o.outputs.data for o in o1],
+        embeddings_1_lst=[o.outputs.data for o in o2],
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
 @pytest.mark.skip_global_cleanup
@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
    v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
                           pooling_params=pooling_params)
-    assert_outputs_equal(v1_output, v2_output)
+    assert_outputs_match(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup
@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
        } for p in TOKEN_IDS],
        pooling_params=pooling_params,
    )
-    assert_outputs_equal(v1_output, v2_output)
+    assert_outputs_match(v1_output, v2_output)
 @pytest.mark.skip_global_cleanup

--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -125,4 +125,7 @@ def test_max_model_len():
    for output in outputs:
        num_total_tokens = len(output.prompt_token_ids) + len(
            output.outputs[0].token_ids)
-        assert num_total_tokens == max_model_len
+        # Total tokens must not exceed max_model_len.
+        # It can be less if generation finishes due to other reasons (e.g., EOS)
+        # before reaching the absolute model length limit.
+        assert num_total_tokens <= max_model_len
--- a/tests/entrypoints/openai/correctness/test_mteb.py
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
@@ -7,34 +7,30 @@ import pytest
 from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                      MTEB_EMBED_TOL,
                                                      OpenAIClientMtebEncoder,
-                                                      run_mteb_embed_task,
+                                                      run_mteb_embed_task)
-                                                      run_mteb_embed_task_st)
 from tests.utils import RemoteOpenAIServer
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
-MODEL_NAME = "BAAI/bge-m3"
+MODEL_NAME = "intfloat/e5-small"
-DTYPE = "float16"
+MAIN_SCORE = 0.7422994752439667
-MAIN_SCORE = 0.7873427091972599
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
+        "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
-        "--max-model-len", "512"
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
-def test_mteb(server):
+def test_mteb_embed(server):
    client = server.get_client()
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
-    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
+    st_main_score = MAIN_SCORE
-        MODEL_NAME, MTEB_EMBED_TASKS)
    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)

--- a/tests/entrypoints/openai/correctness/test_mteb_score.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_score.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import pytest
+# yapf conflicts with isort for this block
+# yapf: disable
+from tests.models.language.pooling.mteb_utils import (
+    MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
+    RerankClientMtebEncoder, ScoreClientMtebEncoder,
+    mteb_test_rerank_models_hf, run_mteb_rerank)
+# yapf: enable
+from tests.utils import RemoteOpenAIServer
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest.fixture(scope="module")
+def st_main_score(hf_runner):
+    # The main score depends on the versions of the installed dependencies,
+    # so we need to recalculate it every time.
+    main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME)
+    return main_score
+def test_mteb_score(server, st_main_score):
+    url = server.url_for("score")
+    encoder = ScoreClientMtebEncoder(MODEL_NAME, url)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
+                                      MTEB_RERANK_LANGS)
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+def test_mteb_rerank(server, st_main_score):
+    url = server.url_for("rerank")
+    encoder = RerankClientMtebEncoder(MODEL_NAME, url)
+    vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS,
+                                      MTEB_RERANK_LANGS)
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATON_OUTPUT = [
+MODEL_TEMPLATE_GENERATION_OUTPUT = [
    ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
 @pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
-    MODEL_TEMPLATE_GENERATON_OUTPUT)
+    MODEL_TEMPLATE_GENERATION_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
                        continue_final_message, expected_output):
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -779,3 +779,57 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
            prompt="Give an example string that fits this regex",
            extra_body=dict(guided_regex=sample_regex,
                            guided_json=sample_json_schema))
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,stream,echo",
+    [
+        (MODEL_NAME, False, False),
+        (MODEL_NAME, False, True),
+        (MODEL_NAME, True, False),
+        (MODEL_NAME, True, True)  # should not raise BadRequestError error
+    ],
+)
+async def test_echo_stream_completion(client: openai.AsyncOpenAI,
+                                      model_name: str, stream: bool,
+                                      echo: bool):
+    saying: str = "Hello, my name is"
+    result = await client.completions.create(model=model_name,
+                                             prompt=saying,
+                                             max_tokens=10,
+                                             temperature=0.0,
+                                             echo=echo,
+                                             stream=stream)
+    stop_reason = "length"
+    if not stream:
+        completion = result
+        assert completion.id is not None
+        assert completion.choices is not None and len(completion.choices) == 1
+        choice = completion.choices[0]
+        assert len(choice.text) >= 5
+        assert choice.finish_reason == stop_reason
+        if echo:
+            assert choice.text is not None and saying in choice.text
+        else:
+            assert choice.text is not None and saying not in choice.text
+    else:
+        chunks: list[str] = []
+        final_finish_reason = None
+        async for chunk in result:
+            if chunk.choices and chunk.choices[0].text:
+                chunks.append(chunk.choices[0].text)
+            if chunk.choices and chunk.choices[0].finish_reason:
+                final_finish_reason = chunk.choices[0].finish_reason
+        assert final_finish_reason == stop_reason
+        content = "".join(chunks)
+        if echo:
+            assert content is not None and saying in content
+        else:
+            assert content is not None and saying not in content
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -21,6 +21,14 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
 DTYPE = "bfloat16"
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 @pytest.fixture(scope="module")
 def server():
    args = [

--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for middleware that's off by default and can be toggled through
+server arguments, mainly --api-key and --enable-request-id-headers.
+"""
+from http import HTTPStatus
+import pytest
+import requests
+from ...utils import RemoteOpenAIServer
+# Use a small embeddings model for faster startup and smaller memory footprint.
+# Since we are not testing any chat functionality,
+# using a chat capable model is overkill.
+MODEL_NAME = "intfloat/multilingual-e5-small"
+@pytest.fixture(scope="module")
+def server(request: pytest.FixtureRequest):
+    passed_params = []
+    if hasattr(request, "param"):
+        passed_params = request.param
+    if isinstance(passed_params, str):
+        passed_params = [passed_params]
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "512",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "2",
+        *passed_params
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest.mark.asyncio
+async def test_no_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"))
+    assert response.status_code == HTTPStatus.OK
+@pytest.mark.asyncio
+async def test_no_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"))
+    assert "X-Request-Id" not in response.headers
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_missing_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"))
+    assert response.status_code == HTTPStatus.UNAUTHORIZED
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_passed_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"),
+                            headers={"Authorization": "Bearer test"})
+    assert response.status_code == HTTPStatus.OK
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_not_v1_api_token(server: RemoteOpenAIServer):
+    # Authorization is only enforced for paths that start with /v1
+    # (e.g. /v1/chat/completions); other paths, such as /health, skip it.
+    response = requests.get(server.url_for("health"))
+    assert response.status_code == HTTPStatus.OK
+@pytest.mark.parametrize(
+    "server",
+    ["--enable-request-id-headers"],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_enable_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"))
+    assert "X-Request-Id" in response.headers
+    assert len(response.headers.get("X-Request-Id", "")) == 32
+@pytest.mark.parametrize(
+    "server",
+    ["--enable-request-id-headers"],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_custom_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"),
+                            headers={"X-Request-Id": "Custom"})
+    assert "X-Request-Id" in response.headers
+    assert response.headers.get("X-Request-Id") == "Custom"
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -7,6 +7,7 @@ import numpy as np
 import pytest
 import requests
+from tests.models.utils import check_embeddings_close
 from vllm.entrypoints.openai.protocol import PoolingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
            np.frombuffer(base64.b64decode(data.data),
                          dtype="float32").tolist())
-    assert responses_float.data[0].data == decoded_responses_base64_data[0]
+    check_embeddings_close(
-    assert responses_float.data[1].data == decoded_responses_base64_data[1]
+        embeddings_0_lst=[d.data for d in responses_float.data],
+        embeddings_1_lst=decoded_responses_base64_data,
+        name_0="float32",
+        name_1="base64")
    # Default response is float32 decoded from base64 by OpenAI Client
    default_response = requests.post(
@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
    default_response.raise_for_status()
    responses_default = PoolingResponse.model_validate(default_response.json())
-    assert responses_float.data[0].data == responses_default.data[0].data
+    # Compare the explicit float32 response against the default-format
+    # response; comparing the default response with itself can never fail.
+    check_embeddings_close(
-    assert responses_float.data[1].data == responses_default.data[1].data
+        embeddings_0_lst=[d.data for d in responses_float.data],
+        embeddings_1_lst=[d.data for d in responses_default.data],
+        name_0="float32",
+        name_1="base64")
--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -12,6 +12,14 @@ MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 @pytest.fixture(scope="module")
 def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -11,6 +11,15 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
 from ...utils import RemoteOpenAIServer
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
 MODELS = [
    {
        "name": "BAAI/bge-reranker-v2-m3",

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -3,8 +3,8 @@
 import asyncio
 from contextlib import suppress
-from dataclasses import dataclass
+from dataclasses import dataclass, field
-from typing import Optional
+from typing import Any, Optional
 from unittest.mock import MagicMock
 from vllm.config import MultiModalConfig
@@ -40,6 +40,7 @@ class MockModelConfig:
    allowed_local_media_path: str = ""
    encoder_config = None
    generation_config: str = "auto"
+    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}

--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
-    prompt = "THE FIRST WORDS I SPOKE"
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
            temperature=0.0)
        out = json.loads(transcription)['text']
        assert "Mary had a little lamb," in out
-        # This should "force" whisper to continue prompt in all caps
-        transcription_wprompt = await client.audio.transcriptions.create(
-            model=model_name,
-            file=mary_had_lamb,
-            language="en",
-            response_format="text",
-            prompt=prompt,
-            temperature=0.0)
-        out_capital = json.loads(transcription_wprompt)['text']
-        assert prompt not in out_capital
 @pytest.mark.asyncio
@@ -74,19 +63,31 @@ async def test_bad_requests(mary_had_lamb):
                                                     language="hh",
                                                     temperature=0.0)
-        # Expect audio too long: repeat the timeseries
-        mary_had_lamb.seek(0)
+@pytest.mark.asyncio
-        audio, sr = librosa.load(mary_had_lamb)
+async def test_long_audio_request(mary_had_lamb):
-        repeated_audio = np.tile(audio, 10)
+    model_name = "openai/whisper-large-v3-turbo"
-        # Repeated audio to buffer
+    server_args = ["--enforce-eager"]
-        buffer = io.BytesIO()
-        sf.write(buffer, repeated_audio, sr, format='WAV')
+    mary_had_lamb.seek(0)
-        buffer.seek(0)
+    audio, sr = librosa.load(mary_had_lamb)
-        with pytest.raises(openai.BadRequestError):
+    # Add small silence after each audio for repeatability in the split process
-            await client.audio.transcriptions.create(model=model_name,
+    audio = np.pad(audio, (0, 1600))
-                                                     file=buffer,
+    repeated_audio = np.tile(audio, 10)
-                                                     language="en",
+    # Repeated audio to buffer
-                                                     temperature=0.0)
+    buffer = io.BytesIO()
+    sf.write(buffer, repeated_audio, sr, format='WAV')
+    buffer.seek(0)
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=buffer,
+            language="en",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert out.count("Mary had a little lamb") == 10
 @pytest.mark.asyncio
@@ -226,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
            extra_body=dict(seed=42))
        assert greedy_transcription.text != transcription.text
+@pytest.mark.asyncio
+async def test_audio_prompt(mary_had_lamb):
+    # Transcribes the same clip twice against a freshly started server, once
+    # without and once with a `prompt`, and expects `prefix` in both outputs.
+    model_name = "openai/whisper-large-v3-turbo"
+    server_args = ["--enforce-eager"]
+    prompt = "This is a speech, recorded in a phonograph."
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        # Supplying a prompt must not make the transcription drop its
+        # opening words.
+        prefix = "The first words I spoke in the original phonograph"
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            temperature=0.0)
+        # The "text" response is parsed as a JSON document with a 'text' key.
+        out = json.loads(transcription)['text']
+        assert prefix in out
+        transcription_wprompt = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            prompt=prompt,
+            temperature=0.0)
+        out_prompt = json.loads(transcription_wprompt)['text']
+        assert prefix in out_prompt
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import io
+# json is used to decode the text responses returned by the server
+import json
+from unittest.mock import patch
+import librosa
+import numpy as np
+import pytest
+import soundfile as sf
+from openai._base_client import AsyncAPIClient
+from vllm.assets.audio import AudioAsset
+from ...utils import RemoteOpenAIServer
+@pytest.fixture
+def foscolo():
+    # Yields the 'azacinto_foscolo' audio asset opened in binary mode; the
+    # `with` block closes the handle once the fixture generator is resumed.
+    # Test translation it->en
+    path = AudioAsset('azacinto_foscolo').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
+@pytest.mark.asyncio
+async def test_basic_audio(foscolo):
+    # Starts a server for the model below, requests a translation of the
+    # fixture audio and checks that the expected English phrase is present.
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="text",
+            # TODO remove once language detection is implemented
+            # (the source language is passed through extra_body).
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        # The "text" response is parsed as a JSON document with a 'text' key;
+        # surrounding whitespace is stripped before the substring check.
+        out = json.loads(translation)['text'].strip()
+        assert "Nor will I ever touch the sacred" in out
+@pytest.mark.asyncio
+async def test_audio_prompt(foscolo):
+    # Requests a translation conditioned on `prompt` and checks that the
+    # output neither contains the phrase expected by test_basic_audio nor
+    # echoes the prompt itself.
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    # Condition whisper on starting text
+    prompt = "Nor have I ever"
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        # NOTE: despite the variable name, this is a translations request.
+        transcription = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            prompt=prompt,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "Nor will I ever touch the sacred" not in out
+        assert prompt not in out
+@pytest.mark.asyncio
+async def test_non_asr_model(foscolo):
+    # Sends a translation request to a model that is not a speech model and
+    # checks the error details carried by the returned object.
+    # text to text model
+    model_name = "JackFram/llama-68m"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.audio.translations.create(model=model_name,
+                                                     file=foscolo,
+                                                     temperature=0.0)
+        # The failure is inspected on the return value (code, empty text and
+        # message) rather than caught as an exception.
+        assert res.code == 400 and not res.text
+        assert res.message == "The model does not support Translations API"
+@pytest.mark.asyncio
+async def test_streaming_response(foscolo):
+    # Compares a non-streamed translation with the text rebuilt from the
+    # chunks of a streamed request for the same audio; both must be equal.
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    # Accumulates the streamed chunk contents.
+    translation = ""
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        # Reference result obtained without streaming.
+        res_no_stream = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="json",
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        # Unfortunately this only works when the openai client is patched
+        # to use streaming mode, not exposed in the translation api.
+        original_post = AsyncAPIClient.post
+        # Wrapper that forces stream=True on every post() call and then
+        # delegates to the saved original method.
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+        # patch.object restores AsyncAPIClient.post when this block exits.
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.translations.create(model=model_name,
+                                                         file=foscolo,
+                                                         temperature=0.0,
+                                                         extra_body=dict(
+                                                             stream=True,
+                                                             language="it"))
+            # Reconstruct from chunks and validate
+            async for chunk in res:
+                # just a chunk
+                text = chunk.choices[0]['delta']['content']
+                translation += text
+        assert translation == res_no_stream.text
+@pytest.mark.asyncio
+async def test_stream_options(foscolo):
+    # Streams a translation with both usage options enabled and checks that
+    # a final chunk without choices arrives and that every chunk with
+    # choices exposes a `usage` attribute.
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        original_post = AsyncAPIClient.post
+        # Wrapper that forces stream=True on every post() call and then
+        # delegates to the saved original method.
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.translations.create(
+                model=model_name,
+                file=foscolo,
+                temperature=0.0,
+                extra_body=dict(language="it",
+                                stream=True,
+                                stream_include_usage=True,
+                                stream_continuous_usage_stats=True))
+            # `final` flips once a chunk without choices is seen;
+            # `continuous` stays True only while every chunk that has
+            # choices also has a `usage` attribute.
+            final = False
+            continuous = True
+            async for chunk in res:
+                if not len(chunk.choices):
+                    # final usage sent
+                    final = True
+                else:
+                    continuous = continuous and hasattr(chunk, 'usage')
+            assert final and continuous
+@pytest.mark.asyncio
+async def test_long_audio_request(foscolo):
+    # Doubles the fixture audio, sends it as an in-memory WAV file and
+    # expects the reference phrase to appear exactly twice in the output.
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    # Rewind before decoding the fixture's file handle.
+    foscolo.seek(0)
+    audio, sr = librosa.load(foscolo)
+    repeated_audio = np.tile(audio, 2)
+    # Repeated audio to buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, repeated_audio, sr, format='WAV')
+    # Rewind so the upload starts from the beginning of the WAV data.
+    buffer.seek(0)
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=buffer,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        # Lower-cased so the phrase count below is case-insensitive.
+        out = json.loads(translation)['text'].strip().lower()
+        # TODO investigate higher model uncertainty for longer translations.
+        assert out.count("nor will i ever") == 2
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -50,7 +50,7 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_video() -> dict[str, str]:
    return {
-        video_url: encode_video_base64(fetch_video(video_url))
+        video_url: encode_video_base64(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -25,6 +25,25 @@ TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]
+# Expected beam-search outputs: one inner list per entry of TEST_IMAGE_URLS
+# (looked up by the same index), holding the two strings that the returned
+# choices are compared against in order.
+EXPECTED_MM_BEAM_SEARCH_RES = [
+    [
+        "The image shows a wooden boardwalk leading through a",
+        "The image shows a wooden boardwalk extending into a",
+    ],
+    [
+        "The image shows two parrots perched on",
+        "The image shows two birds perched on a cur",
+    ],
+    [
+        "The image shows a Venn diagram with three over",
+        "This image shows a Venn diagram with three over",
+    ],
+    [
+        "This image displays a gradient of colors ranging from",
+        "This image displays a gradient of colors transitioning from",
+    ],
+]
 @pytest.fixture(scope="module")
 def server():
@@ -270,10 +289,13 @@ async def test_single_chat_session_image_base64encoded(
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
 async def test_single_chat_session_image_base64encoded_beamsearch(
-        client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        client: openai.AsyncOpenAI, model_name: str, image_idx: int,
        base64_encoded_image: dict[str, str]):
+    # NOTE: This test also validates that we pass MM data through beam search
+    image_url = TEST_IMAGE_URLS[image_idx]
+    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
    messages = [{
        "role":
@@ -297,10 +319,11 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
        messages=messages,
        n=2,
        max_completion_tokens=10,
+        temperature=0.0,
        extra_body=dict(use_beam_search=True))
    assert len(chat_completion.choices) == 2
-    assert chat_completion.choices[
+    for actual, expected_str in zip(chat_completion.choices, expected_res):
-        0].message.content != chat_completion.choices[1].message.content
+        assert actual.message.content == expected_str
 @pytest.mark.asyncio

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
                    "url": image_url
                }
            }, {
-                "type": "image_url",
+                "type": "image_pil",
-                "image_url": {
+                "image_pil": ImageAsset('cherry_blossom').pil_image
-                    "url": image_url
-                }
            }, {
                "type": "text",
                "text": "What's in these images?"
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
                    "url": image_url
                }
            }, {
-                "type": "image_url",
+                "type": "image_pil",
-                "image_url": {
+                "image_pil": ImageAsset('cherry_blossom').pil_image
-                    "url": image_url
-                }
            }, {
                "type": "text",
                "text": "What's in these images?"