Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

d2b52805 · zhuwenwen · 9a521c23 · 5438967f · d2b52805 · d2b52805
Commit d2b52805 authored Sep 07, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
 import subprocess
 import sys
 import tempfile
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
            assert metric in response.text
+@pytest.mark.asyncio
+async def test_abort_metrics_reset(server: RemoteOpenAIServer,
+                                   client: openai.AsyncClient, use_v1: bool):
+    running_requests, waiting_requests, kv_cache_usage = (
+        _get_running_metrics_from_api(server))
+    # Expect no running requests or kvcache usage
+    assert running_requests == 0
+    assert waiting_requests == 0
+    assert kv_cache_usage == 0.0
+    # Start some long-running requests that we can abort
+    tasks = []
+    for _ in range(3):
+        task = asyncio.create_task(
+            client.completions.create(
+                model=MODEL_NAME,
+                prompt=_TOKENIZED_PROMPT,
+                max_tokens=100,  # Long generation to give time to abort
+                temperature=0.0))
+        tasks.append(task)
+    # Wait a bit for requests to start processing
+    await asyncio.sleep(0.5)
+    # Check that we have running requests
+    running_requests, waiting_requests, kv_cache_usage = (
+        _get_running_metrics_from_api(server))
+    # Expect running requests and kvcache usage
+    assert running_requests > 0
+    assert kv_cache_usage > 0
+    # Cancel all tasks to abort the requests
+    for task in tasks:
+        task.cancel()
+    # Wait for cancellations to be processed
+    await asyncio.sleep(1.0)
+    # Check that metrics have reset to zero
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+    # Verify running and waiting requests counts and KV cache usage are zero
+    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
+        _get_running_metrics_from_api(server))
+    assert running_requests_after == 0,\
+        (f"Expected 0 running requests after abort, got "
+         f"{running_requests_after}")
+    assert waiting_requests_after == 0,\
+        (f"Expected 0 waiting requests after abort, got "
+         f"{waiting_requests_after}")
+    assert kv_cache_usage_after == 0,\
+        (f"Expected 0% KV cache usage after abort, got "
+         f"{kv_cache_usage_after}")
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
+    """Return (running_count, waiting_count, kv_cache_usage)"""
+    response = requests.get(server.url_for("metrics"))
+    assert response.status_code == HTTPStatus.OK
+    # Extract running/waiting request counts and KV cache usage from metrics
+    running_requests, waiting_requests, kv_cache_usage = None, None, None
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:num_requests_running":
+            for sample in family.samples:
+                if sample.name == "vllm:num_requests_running":
+                    running_requests = sample.value
+                    break
+        elif family.name == "vllm:num_requests_waiting":
+            for sample in family.samples:
+                if sample.name == "vllm:num_requests_waiting":
+                    waiting_requests = sample.value
+                    break
+        elif family.name == "vllm:gpu_cache_usage_perc":
+            for sample in family.samples:
+                if sample.name == "vllm:gpu_cache_usage_perc":
+                    kv_cache_usage = sample.value
+                    break
+    assert running_requests is not None
+    assert waiting_requests is not None
+    assert kv_cache_usage is not None
+    return running_requests, waiting_requests, kv_cache_usage
 def test_metrics_exist_run_batch(use_v1: bool):
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
            -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
            http://localhost:8000/v1/chat/completions
        """  # noqa: E501
-        if (hasattr(case, "body") and isinstance(case.body, dict)
+        if hasattr(case, "body") and isinstance(case.body, dict):
-                and "messages" in case.body
+            if ("messages" in case.body
-                and isinstance(case.body["messages"], list)
+                    and isinstance(case.body["messages"], list)
-                and len(case.body["messages"]) > 0):
+                    and len(case.body["messages"]) > 0):
-            for message in case.body["messages"]:
+                for message in case.body["messages"]:
-                if not isinstance(message, dict):
+                    if not isinstance(message, dict):
-                    continue
+                        continue
-                # Check for invalid file type in tokenize endpoint
+                    # Check for invalid file type in tokenize endpoint
-                if op.method.lower() == "post" and op.path == "/tokenize":
+                    if op.method.lower() == "post" and op.path == "/tokenize":
-                    content = message.get("content", [])
+                        content = message.get("content", [])
-                    if (isinstance(content, list) and len(content) > 0 and any(
+                        if (isinstance(content, list) and len(content) > 0
-                            item.get("type") == "file" for item in content)):
+                                and any(
-                        return False
+                                    item.get("type") == "file"
+                                    for item in content)):
-                # Check for invalid tool_calls with non-function types
+                            return False
-                tool_calls = message.get("tool_calls", [])
-                if isinstance(tool_calls, list):
+                    # Check for invalid tool_calls with non-function types
-                    for tool_call in tool_calls:
+                    tool_calls = message.get("tool_calls", [])
-                        if isinstance(tool_call, dict):
+                    if isinstance(tool_calls, list):
-                            if tool_call.get("type") != "function":
+                        for tool_call in tool_calls:
-                                return False
+                            if isinstance(tool_call, dict):
-                            if "custom" in tool_call:
+                                if tool_call.get("type") != "function":
-                                return False
+                                    return False
+                                if "custom" in tool_call:
+                                    return False
+            # Sometimes guided_grammar is generated to be empty
+            # Causing a server error in EBNF grammar parsing
+            # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
+            guided_grammar = case.body.get("guided_grammar")
+            if guided_grammar == '':
+                # Allow None (will be handled as no grammar)
+                # But skip empty strings
+                return False
        return True
    return strategy.filter(no_invalid_types)

--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 @pytest.fixture(scope="module")
 def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
 from ...utils import RemoteOpenAIServer
-pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
 MODEL_NAME = "openai/gpt-oss-20b"
-DTYPE = "bfloat16"
 @pytest.fixture(scope="module")
-def server():
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+@pytest.fixture(scope="module")
+def server(monkeypatch_module: pytest.MonkeyPatch):
    args = ["--enforce-eager", "--tool-server", "demo"]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with monkeypatch_module.context() as m:
-        yield remote_server
+        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+            yield remote_server
 @pytest_asyncio.fixture
@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_streaming(client: OpenAI, model_name: str):
+    # TODO: Add back when web search and code interpreter are available in CI
    prompts = [
        "tell me a story about a cat in 20 words",
-        "What is 13 * 24? Use python to calculate the result.",
+        # "What is 13 * 24? Use python to calculate the result.",
-        "When did Jensen found NVIDIA? Search it and answer the year only.",
+        # "When did Jensen found NVIDIA? Search it and answer the year only.",
    ]
    for prompt in prompts:
@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
            input=prompt,
            reasoning={"effort": "low"},
            tools=[
-                {
+                # {
-                    "type": "web_search_preview"
+                #     "type": "web_search_preview"
-                },
+                # },
-                {
+                # {
-                    "type": "code_interpreter",
+                #     "type": "code_interpreter",
-                    "container": {
+                #     "container": {
-                        "type": "auto"
+                #         "type": "auto"
-                    }
+                #     }
-                },
+                # },
            ],
            stream=True,
        )
@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
 async def test_web_search(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
 async def test_code_interpreter(client: OpenAI, model_name: str):
    response = await client.responses.create(
        model=model_name,
@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.flaky(reruns=5)
 async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
    tools = [
        {

--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enable-auto-tool-choice",
+        "--tool-call-parser",
+        "hermes",
+        "--enforce-eager",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest.mark.asyncio
+async def test_basic_completion_with_emoji(server):
+    """Test basic completion with emoji to verify token_ids field."""
+    async with server.get_async_client() as client:
+        # Test with return_token_ids enabled
+        completion = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Complete this sentence with emojis: I love coding 🚀",
+            max_tokens=10,
+            temperature=0,
+            logprobs=1,
+            extra_body={"return_token_ids": True},
+        )
+        # Check the raw response to see the structure
+        completion_dict = completion.model_dump()
+        # Verify prompt_token_ids field is present in the completion response
+        assert "prompt_token_ids" in completion_dict["choices"][0]
+        assert isinstance(completion.choices[0].prompt_token_ids, list)
+        # Check against the expected prompt token IDs
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        encoded_tokens = tokenizer.encode(
+            "Complete this sentence with emojis: I love coding 🚀")
+        # Check that encoded_tokens is a contiguous slice of prompt_token_ids
+        assert any(completion.choices[0].prompt_token_ids[i:i +
+                                                          len(encoded_tokens)]
+                   == encoded_tokens for i in range(
+                       len(completion.choices[0].prompt_token_ids) -
+                       len(encoded_tokens) + 1))
+        # Verify token_ids field is present in the choice
+        assert completion.choices[0].token_ids is not None
+        assert isinstance(completion.choices[0].token_ids, list)
+        assert len(completion.choices[0].token_ids) > 0
+        # Verify decoding works correctly
+        decoded_text = tokenizer.decode(completion.choices[0].token_ids)
+        # The decoded text should contain a <|im_end|> at the end
+        assert decoded_text.startswith(completion.choices[0].text)
+        # Test without return_token_ids (should be None)
+        completion_without = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Complete this sentence with emojis: I love coding 🚀",
+            max_tokens=10,
+            temperature=0,
+            logprobs=1,
+            extra_body={"return_token_ids": False},
+        )
+        completion_without_dict = completion_without.model_dump()
+        assert completion_without_dict["choices"][0].get("token_ids") is None
+        assert completion_without_dict.get("prompt_token_ids") is None
+@pytest.mark.asyncio
+async def test_chat_completion_with_tool_use(server):
+    """Test chat completion with tool use (get_weather function)."""
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type":
+                        "string",
+                        "description":
+                        "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The unit of temperature",
+                    },
+                },
+                "required": ["location"],
+            },
+        },
+    }]
+    async with server.get_async_client() as client:
+        # Test with return_token_ids enabled
+        response = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like in Paris?"
+                },
+            ],
+            tools=tools,
+            tool_choice="auto",
+            max_tokens=100,
+            temperature=0,
+            logprobs=True,
+            extra_body={"return_token_ids": True},
+        )
+        # Verify token_ids field is present in choices
+        assert response.choices[0].token_ids is not None
+        assert isinstance(response.choices[0].token_ids, list)
+        # Verify prompt_token_ids field is present
+        assert response.prompt_token_ids is not None
+        assert isinstance(response.prompt_token_ids, list)
+        # Verify the prompt texts and response texts
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        prompt_text = tokenizer.decode(response.prompt_token_ids)
+        assert prompt_text.startswith(
+            "<|im_start|>system\nYou are a helpful assistant.")
+        assert prompt_text.endswith(
+            "What's the weather like in Paris?<|im_end|>\n"
+            "<|im_start|>assistant\n")
+        response_text = tokenizer.decode(response.choices[0].token_ids)
+        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
+        assert response_text.endswith("</tool_call><|im_end|>")
+        # If tool call was made, verify the response structure
+        if response.choices[0].message.tool_calls:
+            assert len(response.choices[0].message.tool_calls) > 0
+            tool_call = response.choices[0].message.tool_calls[0]
+            assert tool_call.function.name == "get_weather"
+        # Test without return_token_ids
+        response_without = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like in Paris?"
+                },
+            ],
+            tools=tools,
+            tool_choice="auto",
+            max_tokens=100,
+            temperature=0,
+            logprobs=True,
+            extra_body={"return_token_ids": False},
+        )
+        assert response_without.choices[0].token_ids is None
+        assert response_without.prompt_token_ids is None
+@pytest.mark.asyncio
+async def test_comparison_with_prompt_logprobs_and_logprobs(server):
+    """
+    Test that token_ids align with prompt_logprobs and
+    logprobs when return_tokens_as_token_ids is enabled.
+    """
+    async with server.get_async_client() as client:
+        # Test with both return_token_ids and return_tokens_as_token_ids enabled
+        completion = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Hello, world! How are you today?",
+            max_tokens=20,
+            temperature=0,
+            echo=True,
+            logprobs=1,
+            extra_body={
+                "return_token_ids": True,
+                "return_tokens_as_token_ids": True,
+                "prompt_logprobs": 1
+            },
+        )
+        # Verify all fields are present
+        assert completion.choices[0].token_ids is not None
+        assert completion.choices[0].prompt_token_ids is not None
+        assert completion.choices[0].prompt_logprobs is not None
+        assert completion.choices[0].logprobs is not None
+        # Extract token IDs from logprobs
+        # (when return_tokens_as_token_ids is True)
+        logprobs_token_ids = []
+        for token_str in completion.choices[0].logprobs.tokens:
+            # Token format is "token_id:12345" when
+            # return_tokens_as_token_ids is True
+            if token_str.startswith("token_id:"):
+                token_id = int(token_str.removeprefix("token_id:"))
+                logprobs_token_ids.append(token_id)
+        # When echo=True, the logprobs include both prompt and response tokens.
+        # The token_ids field should match the suffix (the response portion);
+        # the prompt_token_ids should match the prompt portion.
+        assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
+        response_token_ids_length = len(completion.choices[0].token_ids)
+        assert logprobs_token_ids[-response_token_ids_length:] == \
+            completion.choices[0].token_ids
+        # Verify tokenizer consistency
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        # Decode prompt tokens
+        if completion.choices[0].prompt_token_ids:
+            prompt_text = tokenizer.decode(
+                completion.choices[0].prompt_token_ids)
+            # The decoded prompt should match or be close to the original
+            assert "Hello, world" in prompt_text
+        # Decode response tokens
+        if completion.choices[0].token_ids:
+            response_text = tokenizer.decode(completion.choices[0].token_ids)
+            assert completion.choices[0].text.endswith(response_text)
+        # Test streaming mode
+        stream = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Tell me a short fact about Python:",
+            max_tokens=30,
+            temperature=0,
+            stream=True,
+            echo=False,
+            logprobs=1,
+            extra_body={
+                "return_token_ids": True,
+                "return_tokens_as_token_ids": True
+            },
+        )
+        # Collect streamed tokens
+        streamed_prompt_token_ids = []
+        streamed_token_ids = []
+        streamed_logprob_token_ids = []
+        first_chunk = True
+        async for chunk in stream:
+            for token_str in chunk.choices[0].logprobs.tokens:
+                # Token format is "token_id:12345" when
+                # return_tokens_as_token_ids is True
+                if token_str.startswith("token_id:"):
+                    token_id = int(token_str.removeprefix("token_id:"))
+                    streamed_logprob_token_ids.append(token_id)
+            if first_chunk:
+                streamed_prompt_token_ids = chunk.choices[0].prompt_token_ids
+                first_chunk = False
+            streamed_token_ids += chunk.choices[0].token_ids
+        # Verify we collected some tokens and first chunk had prompt_token_ids
+        assert len(streamed_prompt_token_ids) > 0
+        assert streamed_token_ids == streamed_logprob_token_ids
+@pytest.mark.asyncio
+async def test_chat_completion_with_emoji_and_token_ids(server):
+    """Test chat completion with emojis to verify token_ids handling."""
+    chat_messages = [
+        {
+            "role": "system",
+            "content": "You like to use emojis in your responses."
+        },
+        {
+            "role": "user",
+            "content": "Repeat after me: I love cats 🐱"
+        },
+    ]
+    async with server.get_async_client() as client:
+        response = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=chat_messages,
+            max_tokens=50,
+            temperature=0,
+            logprobs=True,
+            extra_body={"return_token_ids": True},
+        )
+        # Verify token_ids are present
+        response_dict = response.model_dump()
+        assert response.choices[0].token_ids is not None
+        assert "prompt_token_ids" in response_dict
+        # Verify the response contains the expected fields
+        assert response.choices[0].message.content is not None
+        # Decode token_ids and verify consistency
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
+        assert decoded_prompt.startswith(
+            "<|im_start|>system\nYou like to use emojis in your responses.")
+        assert decoded_prompt.endswith(
+            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")
+        decoded_response = tokenizer.decode(response.choices[0].token_ids)
+        # The content should match the response text
+        # except the ending <|im_end|>
+        assert decoded_response == response.choices[
+            0].message.content + "<|im_end|>"
+        # Test with streaming
+        stream = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=chat_messages,
+            max_tokens=50,
+            temperature=0,
+            stream=True,
+            extra_body={"return_token_ids": True},
+        )
+        collected_content = ""
+        collected_token_ids = []
+        first_chunk = True
+        async for chunk in stream:
+            if first_chunk:
+                assert chunk.prompt_token_ids is not None
+                assert isinstance(chunk.prompt_token_ids, list)
+                # Check the prompt_token_ids match the initial prompt
+                decoded_prompt_stream = tokenizer.decode(
+                    chunk.prompt_token_ids)
+                assert decoded_prompt_stream == decoded_prompt
+                first_chunk = False
+            else:
+                chunk_dump = chunk.model_dump()
+                assert "prompt_token_ids" not in chunk_dump, \
+                    "Subsequent chunks should not have prompt_token_ids"
+            if chunk.choices:
+                if chunk.choices[0].delta.content:
+                    collected_content += chunk.choices[0].delta.content
+                # token_ids may not be present in all chunks
+                choice_dump = chunk.choices[0].model_dump()
+                if "token_ids" in choice_dump:
+                    collected_token_ids.extend(chunk.choices[0].token_ids)
+        # Verify we got response and token_ids
+        assert len(collected_content) > 0
+        assert len(collected_token_ids) > 0
+        # Verify token_ids decode properly
+        decoded_response = tokenizer.decode(collected_token_ids)
+        assert decoded_response == collected_content + "<|im_end|>"
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
 from ...utils import RemoteOpenAIServer
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
 MODELS = [
    {
        "name": "BAAI/bge-reranker-v2-m3",

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config():
    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+@pytest.mark.parametrize("model_type", ["gpt_oss", "any"])
 @pytest.mark.asyncio
-async def test_serving_chat_did_set_correct_cache_salt():
+async def test_serving_chat_did_set_correct_cache_salt(model_type):
    mock_model_config = MockModelConfig()
+    mock_model_config.hf_config.model_type = model_type
    mock_engine = MagicMock(spec=MQLLMEngineClient)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)

--- a/tests/entrypoints/openai/test_token_in_token_out.py
+++ b/tests/entrypoints/openai/test_token_in_token_out.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import tempfile
+import pytest
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf)
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
+@pytest.fixture(scope="module")
+def server():
+    global MODEL_PATH
+    MODEL_PATH = download_weights_from_hf(
+        MODEL_NAME,
+        allow_patterns=["*"],
+        cache_dir=MODEL_PATH,
+        ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
+    args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        "--skip-tokenizer-init",
+        "--load-format",
+        "dummy",
+    ]
+    with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
+        yield remote_server
+@pytest.mark.asyncio
+async def test_token_in_token_out_and_logprobs(server):
+    """
+    Test token-in-token-out and token_ids align with prompt_logprobs
+    & logprobs when return_tokens_as_token_ids is enabled.
+    """
+    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+    text = "Hello, world! How are you today?"
+    token_ids = tokenizer.encode(text)
+    async with server.get_async_client() as client:
+        # Test a token-ID prompt with only return_token_ids enabled
+        completion = await client.completions.create(
+            model=MODEL_PATH,
+            prompt=token_ids,
+            max_tokens=20,
+            temperature=0,
+            echo=True,
+            extra_body={
+                "return_token_ids": True,
+            },
+        )
+        # Verify all fields are present
+        assert (completion.choices[0].token_ids is not None
+                and 0 < len(completion.choices[0].token_ids) <= 20)
+        assert completion.choices[0].prompt_token_ids is not None
+        # Decode prompt tokens
+        if completion.choices[0].prompt_token_ids:
+            prompt_text = tokenizer.decode(
+                completion.choices[0].prompt_token_ids)
+            # The decoded prompt should exactly match the original prompt
+            assert prompt_text == text
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
            language="en",
            response_format="text",
            temperature=0.0)
-        out = json.loads(transcription)['text']
+        out = json.loads(transcription)
-        assert "Mary had a little lamb," in out
+        out_text = out['text']
+        out_usage = out['usage']
+        assert "Mary had a little lamb," in out_text
+        assert out_usage["seconds"] == 16, out_usage["seconds"]
 @pytest.mark.asyncio
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
        language="en",
        response_format="text",
        temperature=0.0)
-    out = json.loads(transcription)['text']
+    out = json.loads(transcription)
-    counts = out.count("Mary had a little lamb")
+    out_text = out['text']
+    out_usage = out['usage']
+    counts = out_text.count("Mary had a little lamb")
    assert counts == 10, counts
+    assert out_usage["seconds"] == 161, out_usage["seconds"]
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/openai/test_truncation.py
@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
    assert response["usage"]["prompt_tokens"] == truncation_size
+@pytest.mark.asyncio
+async def test_zero_truncation_size(client: openai.AsyncOpenAI):
+    truncation_size = 0
+    kwargs: dict[str, Any] = {
+        "model": MODEL_NAME,
+        "input": input,
+        "truncate_prompt_tokens": truncation_size
+    }
+    with pytest.raises(openai.BadRequestError) as err:
+        await client.post(path="embeddings", cast_to=object, body={**kwargs})
+    assert err.value.status_code == 400
+    error_details = err.value.response.json()["error"]
+    assert error_details["type"] == "BadRequestError"
+    assert "This model's maximum context length is" in error_details["message"]
+    assert "tokens in the input for embedding generation" in error_details[
+        "message"]
+    assert "Please reduce the length of the input" in error_details["message"]
 @pytest.mark.asyncio
 async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    truncation_size = max_model_len + 1
@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    }
    with pytest.raises(openai.BadRequestError) as err:
-        err = await client.post(path="embeddings",
+        await client.post(path="embeddings", cast_to=object, body={**kwargs})
-                                cast_to=object,
-                                body={**kwargs})
+    assert err.value.status_code == 400
+    error_details = err.value.response.json()["error"]
-        assert str(err) == f"""openai.BadRequestError: 
+    assert error_details["type"] == "BadRequestError"
-                    Error code: 400 - {{'object': 'error', 
+    expected_message = ("truncate_prompt_tokens value is "
-                    'message': 'truncate_prompt_tokens value 
+                        "greater than max_model_len."
-                    ({truncation_size}) 
+                        " Please, select a smaller truncation size.")
-                    is greater than max_model_len ({max_model_len}). 
+    assert error_details["message"] == expected_message
-                    Please, select a smaller truncation size.', 
-                    'type': 'BadRequestError', 
-                    'param': None, 'code': 400}}"""
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -6,8 +6,6 @@ import json
 import openai
 import pytest
 import pytest_asyncio
-import requests
-from PIL import Image
 from transformers import AutoProcessor
 from vllm.multimodal.utils import encode_image_base64, fetch_image
@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
        "role": "user",
        "content": f"{placeholder}{content}",
    }]
-    images = [Image.open(requests.get(image_url, stream=True).raw)]
+    images = [fetch_image(image_url)]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -5,7 +5,6 @@ import json
 import pytest
 import requests
-from PIL import Image
 from transformers import AutoProcessor
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
    placeholder = "<|image_1|> "
    prompt = f"{placeholder}{content}"
-    images = [Image.open(requests.get(image_url, stream=True).raw)]
+    images = [fetch_image(image_url)]
    inputs = processor(prompt, images, return_tensors="pt")
    return inputs.input_ids.shape[1]

--- a/tests/evals/gsm8k/README.md
+++ b/tests/evals/gsm8k/README.md
+# GSM8K Accuracy Evaluation
+This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control.
+## Usage
+### Run tests with pytest (like buildkite)
+```bash
+pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
+    --config-list-file=configs/models-small.txt \
+    --tp-size=1
+```
+### Run standalone evaluation script
+```bash
+# Start vLLM server first
+vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
+# Run evaluation
+python tests/evals/gsm8k/gsm8k_eval.py --port 8000
+```
+## Configuration Format
+Model configs in `configs/` directory use this YAML format:
+```yaml
+model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+accuracy_threshold: 0.54  # Minimum expected accuracy
+num_questions: 1319       # Number of questions (default: full test set)
+num_fewshot: 5            # Few-shot examples from train set
+max_model_len: 4096       # Model context length
+```
--- a/tests/evals/gsm8k/__init__.py
+++ b/tests/evals/gsm8k/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
+accuracy_threshold: 0.74
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
+accuracy_threshold: 0.31
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+accuracy_threshold: 0.45
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+accuracy_threshold: 0.60
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+model_name: "Qwen/Qwen3-0.6B-FP8"
+accuracy_threshold: 0.375
+num_questions: 1319
+num_fewshot: 5
+max_model_len: 4096
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/models-small.txt
+++ b/tests/evals/gsm8k/configs/models-small.txt
+Qwen3-0.6B-FP8.yaml
+Llama-3.2-1B-Instruct-INT8-CT.yaml
+Llama-3-8B-Instruct-nonuniform-CT.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+Qwen1.5-MoE-W4A16-CT.yaml