Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/llm/test_prompt_validation.py
+++ b/tests/entrypoints/llm/test_prompt_validation.py
@@ -5,7 +5,6 @@ import os

 from vllm import LLM
 from ...utils import models_path_prefix
-from vllm.config import LoadFormat


 @pytest.fixture(autouse=True)
@@ -17,17 +16,13 @@ def v1(run_with_both_engines):


 def test_empty_prompt():
-    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"), 
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        llm.generate([""])


 @pytest.mark.skip_v1
 def test_out_of_vocab_token():
-    llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
-              load_format=LoadFormat.RUNAI_STREAMER,
-              enforce_eager=True)
+    llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
    with pytest.raises(ValueError, match='out of vocabulary'):
        llm.generate({"prompt_token_ids": [999999]})
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -56,32 +56,37 @@ def cache_models():

 @pytest.mark.skip_global_cleanup
 @pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
    # Set HF to offline mode and ensure we can still construct an LLM
-    try:
-        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
-        monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
-
-        def disable_connect(*args, **kwargs):
-            raise RuntimeError("No http calls allowed")
-
-        monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
-                            disable_connect)
-        monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
-                            disable_connect)
-
-        # Need to re-import huggingface_hub and friends to setup offline mode
-        _re_import_modules()
-        # Cached model files should be used in offline mode
-        for model_config in MODEL_CONFIGS:
-            LLM(**model_config)
-    finally:
-        # Reset the environment after the test
-        # NB: Assuming tests are run in online mode
-        monkeypatch.delenv("HF_HUB_OFFLINE")
-        monkeypatch.delenv("VLLM_NO_USAGE_STATS")
-        _re_import_modules()
-        pass
+    with monkeypatch.context() as m:
+        try:
+            m.setenv("HF_HUB_OFFLINE", "1")
+            m.setenv("VLLM_NO_USAGE_STATS", "1")
+
+            def disable_connect(*args, **kwargs):
+                raise RuntimeError("No http calls allowed")
+
+            m.setattr(
+                urllib3.connection.HTTPConnection,
+                "connect",
+                disable_connect,
+            )
+            m.setattr(
+                urllib3.connection.HTTPSConnection,
+                "connect",
+                disable_connect,
+            )
+
+            # Need to re-import huggingface_hub
+            # and friends to setup offline mode
+            _re_import_modules()
+            # Cached model files should be used in offline mode
+            for model_config in MODEL_CONFIGS:
+                LLM(**model_config)
+        finally:
+            # Reset the environment after the test
+            # NB: Assuming tests are run in online mode
+            _re_import_modules()


 def _re_import_modules():

--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500
 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
-EXPECTED_VALUE = 0.58
+EXPECTED_VALUE = 0.54
 DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
 MORE_ARGS_LIST = [
    [],  # Default
@@ -71,7 +71,7 @@ def run_test(more_args):
 @pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):


 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+                                    more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:

--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -10,7 +10,6 @@ import asyncio
 import io
 import time
 from statistics import mean, median
-from typing import List

 import librosa
 import pytest
@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request):
    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
    _ = await bound_transcribe(model, sem, client, (audio, sr), "")

-    tasks: List[asyncio.Task] = []
+    tasks: list[asyncio.Task] = []
    for sample in data:
        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
        task = asyncio.create_task(

--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest
 from transformers import AutoTokenizer

@@ -180,7 +178,7 @@ def test_reasoning(
 ):
    output = tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
-    output_tokens: List[str] = [
+    output_tokens: list[str] = [
        tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(

--- a/tests/entrypoints/openai/reasoning_parsers/utils.py
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union

 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor:

 def run_reasoning_extraction(
    reasoning_parser: ReasoningParser,
-    model_output: List[str],
+    model_output: list[str],
    request: Union[ChatCompletionRequest, None] = None,
    streaming: bool = False,
-) -> Tuple[Optional[str], Optional[str]]:
+) -> tuple[Optional[str], Optional[str]]:
    if streaming:
        reconstructor = run_reasoning_extraction_streaming(
            reasoning_parser,
@@ -55,9 +55,9 @@ def run_reasoning_extraction(

 def run_reasoning_extraction_nonstreaming(
    reasoning_parser: ReasoningParser,
-    model_output: List[str],
+    model_output: list[str],
    request: Union[ChatCompletionRequest, None] = None,
-) -> Tuple[Optional[str], Optional[str]]:
+) -> tuple[Optional[str], Optional[str]]:
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    return reasoning_parser.extract_reasoning_content(
        model_output=''.join(model_output), request=request)
@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming(

 def run_reasoning_extraction_streaming(
    reasoning_parser: ReasoningParser,
-    model_deltas: List[str],
+    model_deltas: list[str],
    request: Union[ChatCompletionRequest, None] = None,
 ) -> StreamingReasoningReconstructor:
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    reconstructor = StreamingReasoningReconstructor()
    previous_text = ""
-    previous_tokens: List[int] = []
+    previous_tokens: list[int] = []
    for delta in model_deltas:
        token_delta = [
            reasoning_parser.vocab.get(token)

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Dict, List
-
 import openai
 import pytest
 import os
@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--dtype",
-        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
@@ -41,7 +37,7 @@ async def client(server):


 @pytest.fixture(scope="session")
-def base64_encoded_audio() -> Dict[str, str]:
+def base64_encoded_audio() -> dict[str, str]:
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=201, total_tokens=211)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    message = chat_completion.choices[0].message
@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_single_chat_session_audio_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
-        base64_encoded_audio: Dict[str, str]):
+        base64_encoded_audio: dict[str, str]):

    messages = [{
        "role":
@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=201, total_tokens=211)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    message = chat_completion.choices[0].message
@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded(
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_single_chat_session_input_audio(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
-        base64_encoded_audio: Dict[str, str]):
+        base64_encoded_audio: dict[str, str]):
    messages = [{
        "role":
        "user",
@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=201, total_tokens=211)
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    message = choice.message
    message = chat_completion.choices[0].message
@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
        temperature=0.0,
        stream=True,
    )
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
                                          model_name: str, audio_url: str,
-                                          base64_encoded_audio: Dict[str,
+                                          base64_encoded_audio: dict[str,
                                                                     str]):
    messages = [{
        "role":
@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
        temperature=0.0,
        stream=True,
    )
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                                 audio_url: str,
-                                 base64_encoded_audio: Dict[str, str]):
+                                 base64_encoded_audio: dict[str, str]):

    messages = [{
        "role":

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -2,7 +2,6 @@

 import asyncio
 from http import HTTPStatus
-from typing import List

 import openai
 import pytest
@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")


 @pytest.fixture(scope='module')
-def server_args(request: pytest.FixtureRequest) -> List[str]:
+def server_args(request: pytest.FixtureRequest) -> list[str]:
    """ Provide extra arguments to the server via indirect parametrization

    Usage:
@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
            extra_headers={
                "Content-Type": "application/x-www-form-urlencoded"
            })
+
+
+@pytest.mark.parametrize(
+    "server_args",
+    [
+        pytest.param(["--enable-server-load-tracking"],
+                     id="enable-server-load-tracking")
+    ],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_server_load(server: RemoteOpenAIServer):
+    # Check initial server load
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 0
+
+    def make_long_completion_request():
+        return requests.post(
+            server.url_for("v1/completions"),
+            headers={"Content-Type": "application/json"},
+            json={
+                "prompt": "Give me a long story",
+                "max_tokens": 1000,
+                "temperature": 0,
+            },
+        )
+
+    # Start the completion request in a background thread.
+    completion_future = asyncio.create_task(
+        asyncio.to_thread(make_long_completion_request))
+
+    # Give a short delay to ensure the request has started.
+    await asyncio.sleep(0.1)
+
+    # Check server load while the completion request is running.
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 1
+
+    # Wait for the completion request to finish.
+    await completion_future
+    await asyncio.sleep(0.1)
+
+    # Check server load after the completion request has finished.
+    response = requests.get(server.url_for("load"))
+    assert response.status_code == HTTPStatus.OK
+    assert response.json().get("server_load") == 0
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -3,13 +3,14 @@
 # imports for guided decoding tests
 import json
 import re
-from typing import Dict, List, Optional
+from typing import Optional

 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import os
 import pytest_asyncio
+import requests
 import torch
 from openai import BadRequestError

@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
 async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
                                    model_name: str,
                                    prompt_logprobs: Optional[int]):
-    params: Dict = {
+    params: dict = {
        "messages": [{
            "role": "system",
            "content": "You are a helpful assistant."
@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
 )
 async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
                                                  model_name: str):
-    params: Dict = {
+    params: dict = {
        "messages": [{
            "role": "system",
            "content": "You are a helpful assistant."
@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
        temperature=0.0,
        stream=True,
    )
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI):

        assert ("greater_than_equal" in exc_info.value.message
                or "less_than_equal" in exc_info.value.message)
+
+
+@pytest.mark.asyncio
+async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
+    url = f"http://localhost:{server.port}/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json",
+    }
+    data = {
+        # model_name is avoided here.
+        "messages": [{
+            "role": "system",
+            "content": "You are a helpful assistant."
+        }, {
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        "max_tokens":
+        5
+    }
+
+    response = requests.post(url, headers=headers, json=data)
+    response_data = response.json()
+    print(response_data)
+
+    choice = response_data.get("choices")[0]
+    message = choice.get("message")
+    assert message is not None
+    content = message.get("content")
+    assert content is not None
+    assert len(content) > 0
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix

 # # any model with a chat template should work here
 MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
-DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501


 @pytest.fixture(scope="module")
@@ -23,8 +22,6 @@ def server():
        "--enforce-eager",
        "--max-model-len",
        "4080",
-        "--chat-template",
-        DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
    # Call the function and get the result
    result = apply_hf_chat_template(
        tokenizer,
+        trust_remote_code=True,
        conversation=mock_request.messages,
        chat_template=mock_request.chat_template or template_content,
+        tools=None,
        add_generation_prompt=mock_request.add_generation_prompt,
        continue_final_message=mock_request.continue_final_message,
    )

--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+# SPDX-License-Identifier: Apache-2.0
+
+import openai  # use the official client for correctness check
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+# a reasoning and tool calling model
+MODEL_NAME = "Qwen/QwQ-32B"
+
+
+@pytest.fixture(scope="module")
+def server():  # noqa: F811
+    args = [
+        "--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
+        "--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
+        "--tool-call-parser", "hermes"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+TOOLS = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state that the city is"
+                    " in, e.g. 'CA' which would mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            },
+            "required": ["city", "state", "unit"]
+        }
+    }
+}]
+
+MESSAGES = [{
+    "role": "user",
+    "content": "Hi! How are you doing today?"
+}, {
+    "role": "assistant",
+    "content": "I'm doing well! How can I help you?"
+}, {
+    "role":
+    "user",
+    "content":
+    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+}]
+
+FUNC_NAME = "get_current_weather"
+FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
+
+
+def extract_reasoning_and_calls(chunks: list):
+    reasoning_content = ""
+    tool_call_idx = -1
+    arguments = []
+    function_names = []
+    for chunk in chunks:
+        if chunk.choices[0].delta.tool_calls:
+            tool_call = chunk.choices[0].delta.tool_calls[0]
+            if tool_call.index != tool_call_idx:
+                tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
+                arguments.append("")
+                function_names.append("")
+
+            if tool_call.function:
+                if tool_call.function.name:
+                    function_names[tool_call_idx] = tool_call.function.name
+
+                if tool_call.function.arguments:
+                    arguments[tool_call_idx] += tool_call.function.arguments
+        else:
+            if hasattr(chunk.choices[0].delta, "reasoning_content"):
+                reasoning_content += chunk.choices[0].delta.reasoning_content
+    return reasoning_content, arguments, function_names
+
+
+# test streaming
+@pytest.mark.asyncio
+async def test_chat_streaming_of_tool_and_reasoning(
+        client: openai.AsyncOpenAI):
+
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=True,
+    )
+
+    chunks = []
+    async for chunk in stream:
+        chunks.append(chunk)
+
+    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
+        chunks)
+    assert len(reasoning_content) > 0
+    assert len(function_names) > 0 and function_names[0] == FUNC_NAME
+    assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
+
+
+# test full generate
+@pytest.mark.asyncio
+async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
+
+    tool_calls = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=False,
+    )
+
+    assert len(tool_calls.choices[0].message.reasoning_content) > 0
+    assert tool_calls.choices[0].message.tool_calls[0].function.name \
+          == FUNC_NAME
+    assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
+          == FUNC_ARGS
--- a/tests/entrypoints/openai/test_cli_args.py
+++ b/tests/entrypoints/openai/test_cli_args.py
@@ -26,7 +26,7 @@ def serve_parser():
    return make_arg_parser(parser)


-### Tests for Lora module parsing
+### Tests for LoRA module parsing
 def test_valid_key_value_format(serve_parser):
    # Test old format: name=path
    args = serve_parser.parse_args([

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -5,7 +5,7 @@ import json
 import re
 import shutil
 from tempfile import TemporaryDirectory
-from typing import Dict, List, Optional
+from typing import Optional

 import jsonschema
 import openai  # use the official client for correctness check
@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
 async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
                                          model_name: str,
                                          prompt_logprobs: Optional[int]):
-    params: Dict = {
+    params: dict = {
        "prompt": ["A robot may not injure another robot", "My name is"],
        "model": model_name,
    }
@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
-    chunks: List[str] = []
+    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
                                             max_tokens=max_tokens,
                                             n=n,
                                             stream=True)
-    chunks: List[List[str]] = [[] for i in range(n)]
+    chunks: list[list[str]] = [[] for i in range(n)]
    finish_reason_count = 0
    async for chunk in stream:
        index = chunk.choices[0].index

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
+MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501


@@ -28,7 +28,7 @@ def server():
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
-        "8192",
+        "512",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]
@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 9
-    assert embeddings.usage.total_tokens == 9
+    assert embeddings.usage.prompt_tokens == 11
+    assert embeddings.usage.total_tokens == 11

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 5
    assert embeddings.usage.total_tokens == 5
@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
-    # test List[str]
+    # test list[str]
    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):

    assert embeddings.id is not None
    assert len(embeddings.data) == 3
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 32
-    assert embeddings.usage.total_tokens == 32
+    assert embeddings.usage.prompt_tokens == 33
+    assert embeddings.usage.total_tokens == 33

-    # test List[List[int]]
+    # test list[list[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
    embedding_response = await client.embeddings.create(
@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):

    assert embeddings.id is not None
    assert len(embeddings.data) == 4
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 17
    assert embeddings.usage.total_tokens == 17
@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10
@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
-    assert len(embeddings.data[0].embedding) == 4096
+    assert len(embeddings.data[0].embedding) == 384
    assert embeddings.usage.completion_tokens == 0
    assert embeddings.usage.prompt_tokens == 10
    assert embeddings.usage.total_tokens == 10

--- a/tests/entrypoints/openai/test_encoder_decoder.py
+++ b/tests/entrypoints/openai/test_encoder_decoder.py
@@ -7,7 +7,7 @@ import pytest_asyncio

 from ...utils import RemoteOpenAIServer, models_path_prefix
 from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
-from vllm.utils import is_hip
+from vllm.platforms import current_platform

 MODEL_NAME = os.path.join(models_path_prefix, "facebook/bart-base")

@@ -30,7 +30,7 @@ async def client(server):
        yield async_client


-@pytest.mark.skipif(is_hip(),
+@pytest.mark.skipif(current_platform.is_rocm(),
                    reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
+    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:iteration_tokens_total",
+    "vllm:cache_config_info",
    "vllm:request_success_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
+    "vllm:request_params_n_sum",
+    "vllm:request_params_n_bucket",
+    "vllm:request_params_n_count",
+    "vllm:request_params_max_tokens_sum",
+    "vllm:request_params_max_tokens_bucket",
+    "vllm:request_params_max_tokens_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
 def test_metrics_exist_run_batch(use_v1: bool):
    if use_v1:
        pytest.skip("Skipping test on vllm V1")
-    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
+    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

    #base_url = "0.0.0.0"
    base_url = "localhost"
@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
            "-o",
            output_file.name,
            "--model",
-            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
+            os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
            "--enable-metrics",
            "--url",
            base_url,

--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
-    # test List[str]
+    # test list[str]
    input_texts = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
    assert poolings.usage.prompt_tokens == 25
    assert poolings.usage.total_tokens == 25

-    # test List[List[int]]
+    # test list[list[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
    response = requests.post(

--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "BAAI/bge-reranker-base"
+DTYPE = "bfloat16"


 @pytest.fixture(scope="module")
 def server():
-    args = ["--enforce-eager", "--max-model-len", "100"]
+    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
    query = "What is the capital of France?"
@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
    assert rerank.results[1].relevance_score <= 0.01


-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_top_n(server: RemoteOpenAIServer, model_name: str):
    query = "What is the capital of France?"
@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
    assert rerank.results[1].relevance_score <= 0.01


-@pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):


--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME


 @pytest.fixture(scope="module")
-def server_with_return_tokens_as_token_ids_flag(
-        default_server_args):  # noqa: F811
-    args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
-    with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
-        yield remote_server
+def server_fixture(request, default_server_args):  # noqa: F811
+    use_server_flag = request.param
+    if use_server_flag:
+        args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
+        with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
+            yield (remote_server, True)
+    else:
+        with RemoteOpenAIServer(MODEL_NAME,
+                                default_server_args) as remote_server:
+            yield (remote_server, False)


 @pytest.mark.asyncio
+@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
 async def test_completion_return_tokens_as_token_ids_completion(
-        server_with_return_tokens_as_token_ids_flag):
-    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    ) as client:
+        server_fixture):
+    server, use_server_flag = server_fixture
+    request_args = {}
+    if not use_server_flag:
+        request_args["return_tokens_as_token_ids"] = True
+
+    async with server.get_async_client() as client:

        completion = await client.completions.create(
            model=MODEL_NAME,
@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
            echo=True,
            temperature=0,
            max_tokens=10,
-            logprobs=1)
+            logprobs=1,
+            extra_body=request_args)

        text = completion.choices[0].text
        token_strs = completion.choices[0].logprobs.tokens
@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(


 @pytest.mark.asyncio
-async def test_chat_return_tokens_as_token_ids_completion(
-        server_with_return_tokens_as_token_ids_flag):
-    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    ) as client:
+@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
+async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
+    server, use_server_flag = server_fixture
+    request_args = {}
+    if not use_server_flag:
+        request_args["return_tokens_as_token_ids"] = True
+
+    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            # Include Unicode characters to test for dividing a single
@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
            }],
            temperature=0,
            max_tokens=8,
-            logprobs=True)
+            logprobs=True,
+            extra_body=request_args)

        text = response.choices[0].message.content
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)