Merge tag 'v0.9.1' into v0.9.1-ori

cc7f22a8 · zhuwenwen · b9ea0c09 · b6553be1 · cc7f22a8 · cc7f22a8
Commit cc7f22a8 authored Jun 11, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import openai

--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import requests

--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Separate these tests out from test_completion and test_chat, because they
 # require launching a second server with a different flag. Running both servers

--- a/tests/entrypoints/openai/test_root_path.py
+++ b/tests/entrypoints/openai/test_root_path.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import os

--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import subprocess
-import sys
 import tempfile
+import pytest
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 # ruff: noqa: E501
@@ -24,9 +26,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
 {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
-INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
+INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
+{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
 def test_empty_file():
    with tempfile.NamedTemporaryFile(
@@ -35,9 +41,8 @@ def test_empty_file():
        input_file.write("")
        input_file.flush()
        proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            input_file.name, "-o", output_file.name, "--model",
+            "--model", "intfloat/multilingual-e5-small"
-            "intfloat/multilingual-e5-small"
        ], )
        proc.communicate()
        proc.wait()
@@ -54,9 +59,8 @@ def test_completions():
        input_file.write(INPUT_BATCH)
        input_file.flush()
        proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            input_file.name, "-o", output_file.name, "--model",
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
-            "NousResearch/Meta-Llama-3-8B-Instruct"
        ], )
        proc.communicate()
        proc.wait()
@@ -79,9 +83,8 @@ def test_completions_invalid_input():
        input_file.write(INVALID_INPUT_BATCH)
        input_file.flush()
        proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            input_file.name, "-o", output_file.name, "--model",
+            "--model", "NousResearch/Meta-Llama-3-8B-Instruct"
-            "NousResearch/Meta-Llama-3-8B-Instruct"
        ], )
        proc.communicate()
        proc.wait()
@@ -95,9 +98,8 @@ def test_embeddings():
        input_file.write(INPUT_EMBEDDING_BATCH)
        input_file.flush()
        proc = subprocess.Popen([
-            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name,
-            input_file.name, "-o", output_file.name, "--model",
+            "--model", "intfloat/multilingual-e5-small"
-            "intfloat/multilingual-e5-small"
        ], )
        proc.communicate()
        proc.wait()
@@ -110,16 +112,17 @@ def test_embeddings():
            BatchRequestOutput.model_validate_json(line)
-def test_score():
+@pytest.mark.parametrize("input_batch",
+                         [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH])
+def test_score(input_batch):
    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
-        input_file.write(INPUT_SCORE_BATCH)
+        input_file.write(input_batch)
        input_file.flush()
        proc = subprocess.Popen([
-            sys.executable,
+            "vllm",
-            "-m",
+            "run-batch",
-            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",

--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 import pytest

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 from contextlib import suppress

--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 from unittest.mock import MagicMock

--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import openai
 import pytest

--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import requests

--- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 import json
 import tempfile

--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import pytest_asyncio
@@ -76,11 +77,11 @@ async def test_tokenize_completions(
                                 })
        response.raise_for_status()
-        assert response.json() == {
+        result = response.json()
-            "tokens": tokens,
+        assert result["tokens"] == tokens
-            "count": len(tokens),
+        assert result["count"] == len(tokens)
-            "max_model_len": 8192
+        assert result["max_model_len"] == 8192
-        }
+        assert result["token_strs"] is None
 @pytest.mark.asyncio
@@ -138,11 +139,11 @@ async def test_tokenize_chat(
                                         })
                response.raise_for_status()
-                assert response.json() == {
+                result = response.json()
-                    "tokens": tokens,
+                assert result["tokens"] == tokens
-                    "count": len(tokens),
+                assert result["count"] == len(tokens)
-                    "max_model_len": 8192
+                assert result["max_model_len"] == 8192
-                }
+                assert result["token_strs"] is None
 @pytest.mark.asyncio
@@ -215,11 +216,46 @@ async def test_tokenize_chat_with_tools(
                )
                response.raise_for_status()
-                assert response.json() == {
+                result = response.json()
-                    "tokens": tokens,
+                assert result["tokens"] == tokens
-                    "count": len(tokens),
+                assert result["count"] == len(tokens)
-                    "max_model_len": 8192,
+                assert result["max_model_len"] == 8192
-                }
+                assert result["token_strs"] is None
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name, tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_with_return_token_strs(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+    prompt = "This is a token_strs test prompt! vllm1"
+    response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "prompt": prompt,
+            "model": model_name,
+            "return_token_strs": True
+        },
+    )
+    response.raise_for_status()
+    tokens = tokenizer.encode(prompt, add_special_tokens=True)
+    tokens_str = tokenizer.convert_ids_to_tokens(tokens)
+    result = response.json()
+    assert result["tokens"] == tokens
+    assert result["count"] == len(tokens)
+    assert result["max_model_len"] == 8192
+    assert result["token_strs"] == tokens_str
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # imports for guided decoding tests
 import io

--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/openai/test_truncation.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 import openai

--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json

--- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 import pytest
@@ -191,3 +192,27 @@ def test_streaming_tool_call_with_large_steps():
    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
--- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from unittest.mock import MagicMock
+from unittest.mock import MagicMock, patch
 import pytest
@@ -159,3 +160,27 @@ def test_streaming_tool_call_with_large_steps():
    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+    with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex):
+        content, tool_calls = run_tool_extraction(tool_parser,
+                                                  fake_problematic_input,
+                                                  streaming=streaming)
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from typing import Union