Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
...@@ -9,6 +9,7 @@ import pytest ...@@ -9,6 +9,7 @@ import pytest
import pytest_asyncio import pytest_asyncio
import torch.cuda import torch.cuda
from tests.utils import RemoteOpenAIServer
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import ( from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, TensorizerConfig,
...@@ -17,8 +18,6 @@ from vllm.model_executor.model_loader.tensorizer import ( ...@@ -17,8 +18,6 @@ from vllm.model_executor.model_loader.tensorizer import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct" MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b" LORA_PATH = "davzoku/finqa_adapter_1b"
......
...@@ -6,11 +6,10 @@ import tempfile ...@@ -6,11 +6,10 @@ import tempfile
import pytest import pytest
from tests.utils import RemoteOpenAIServer
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b") MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
......
...@@ -19,8 +19,10 @@ import soundfile ...@@ -19,8 +19,10 @@ import soundfile
import torch import torch
from datasets import load_dataset from datasets import load_dataset
from evaluate import load from evaluate import load
from transformers import AutoTokenizer
from vllm.tokenizers import get_tokenizer
from ....models.registry import HF_EXAMPLE_MODELS
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
...@@ -64,8 +66,12 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference): ...@@ -64,8 +66,12 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference):
async def process_dataset(model, client, data, concurrent_request): async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request) sem = asyncio.Semaphore(concurrent_request)
# Load tokenizer once outside the loop model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
tokenizer = AutoTokenizer.from_pretrained(model) tokenizer = get_tokenizer(
model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
)
# Warmup call as the first `librosa.load` server-side is quite slow. # Warmup call as the first `librosa.load` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
...@@ -144,20 +150,35 @@ def run_evaluation( ...@@ -144,20 +150,35 @@ def run_evaluation(
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo".. # alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"]) # NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize(
"model_config",
[
("openai/whisper-large-v3", 12.744980),
# TODO (ekagra): add HF ckpt after asr release
# ("/host/engines/vllm/audio/2b-release", 11.73),
],
)
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice. # Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize( @pytest.mark.parametrize(
"dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"] "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
) )
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness( def test_wer_correctness(
model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None model_config, dataset_repo, n_examples=-1, max_concurrent_request=None
): ):
model_name, expected_wer = model_config
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_name)
# TODO refactor to use `ASRDataset` # TODO refactor to use `ASRDataset`
server_args = [
"--enforce-eager",
f"--tokenizer_mode={model_info.tokenizer_mode}",
]
if model_info.trust_remote_code:
server_args.append("--trust-remote-code")
with RemoteOpenAIServer( with RemoteOpenAIServer(
model_name, ["--enforce-eager"], max_wait_seconds=480 model_name,
server_args,
) as remote_server: ) as remote_server:
dataset = load_hf_dataset(dataset_repo) dataset = load_hf_dataset(dataset_repo)
...@@ -167,7 +188,14 @@ def test_wer_correctness( ...@@ -167,7 +188,14 @@ def test_wer_correctness(
client = remote_server.get_async_client() client = remote_server.get_async_client()
wer = run_evaluation( wer = run_evaluation(
model_name, client, dataset, max_concurrent_request, n_examples model_name,
client,
dataset,
max_concurrent_request,
n_examples,
) )
print(f"Expected WER: {expected_wer}, Actual WER: {wer}")
if expected_wer: if expected_wer:
torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2) torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
...@@ -5,7 +5,7 @@ import openai # use the official client for correctness check ...@@ -5,7 +5,7 @@ import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
......
...@@ -2,20 +2,19 @@ ...@@ -2,20 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import base64
import json import json
import warnings import warnings
import librosa import librosa
import numpy as np import numpy as np
import pybase64 as base64
import pytest import pytest
import websockets import websockets
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [ MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "--tokenizer_mode",
"mistral", "mistral",
......
...@@ -8,6 +8,9 @@ from collections.abc import Callable ...@@ -8,6 +8,9 @@ from collections.abc import Callable
from typing import Any from typing import Any
import pytest import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -361,3 +364,38 @@ def log_response_diagnostics( ...@@ -361,3 +364,38 @@ def log_response_diagnostics(
) )
return diagnostics return diagnostics
@pytest.fixture(scope="module")
def default_server_args():
    """Return the module-scoped list of CLI arguments for the test server."""
    server_args = ["--max-model-len", "18192"]
    # For faster startup.
    server_args.append("--enforce-eager")
    server_args.append("--enable-auto-tool-choice")
    # Flag/value pairs are appended together so each option stays with its value.
    server_args += ["--structured-outputs-config.backend", "xgrammar"]
    server_args += ["--tool-call-parser", "hermes"]
    server_args += ["--reasoning-parser", "qwen3"]
    return server_args
@pytest.fixture(scope="module")
def server_with_store(default_server_args):
    """Yield a `RemoteOpenAIServer` for "Qwen/Qwen3-1.7B", once per module.

    The server is created with `default_server_args` and an `env_dict` that
    sets `VLLM_ENABLE_RESPONSES_API_STORE` and `VLLM_SERVER_DEV_MODE` to "1".
    """
    store_env = {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        "VLLM_SERVER_DEV_MODE": "1",
    }
    with RemoteOpenAIServer(
        "Qwen/Qwen3-1.7B", default_server_args, env_dict=store_env
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server_with_store):
    """Yield the async client of `server_with_store` inside its context."""
    async with server_with_store.get_async_client() as api_client:
        yield api_client
...@@ -118,7 +118,6 @@ async def test_function_tool_use( ...@@ -118,7 +118,6 @@ async def test_function_tool_use(
tool_choice=tool_choice, tool_choice=tool_choice,
temperature=0.0, temperature=0.0,
) )
assert len(response.output) >= 1 assert len(response.output) >= 1
tool_call = None tool_call = None
reasoning = None reasoning = None
...@@ -127,11 +126,43 @@ async def test_function_tool_use( ...@@ -127,11 +126,43 @@ async def test_function_tool_use(
tool_call = out tool_call = out
if out.type == "reasoning": if out.type == "reasoning":
reasoning = out reasoning = out
assert tool_call is not None if response.incomplete_details is None:
assert tool_call.type == "function_call" assert tool_call is not None
assert json.loads(tool_call.arguments) is not None assert tool_call.type == "function_call"
assert reasoning is not None assert json.loads(tool_call.arguments) is not None
assert reasoning.type == "reasoning" assert reasoning is not None
assert reasoning.type == "reasoning"
else:
print(response.model_dump_json(indent=2))
assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens_with_tool_choice_required(
    client: openai.AsyncOpenAI, model_name: str
):
    """Check a `tool_choice="required"` request cut off by `max_output_tokens`.

    The request caps output at 10 tokens. The response must contain at least
    one output item, none of type `function_call`, and must report
    `incomplete_details.reason == "max_output_tokens"`.
    """
    prompt = [
        {
            "role": "user",
            "content": "Can you tell me what the current weather is in Berlin and the "
            "forecast for the next 5 days, in fahrenheit?",
        },
    ]
    response = await client.responses.create(
        model=model_name,
        input=prompt,
        tools=tools,
        tool_choice="required",
        max_output_tokens=10,
    )
    assert len(response.output) >= 1
    for out in response.output:
        # When `tool_choice="required"` and the tokens of `tools`
        # exceed `max_output_tokens`, `function_call` should be empty.
        # This behavior should be consistent with OpenAI.
        assert out.type != "function_call"
    # `incomplete_details` is optional on the response; check it explicitly so
    # an untruncated response fails with a clear assertion instead of an
    # AttributeError on `None`.
    assert response.incomplete_details is not None
    assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -16,7 +16,8 @@ import requests ...@@ -16,7 +16,8 @@ import requests
from openai import InternalServerError, NotFoundError, OpenAI from openai import InternalServerError, NotFoundError, OpenAI
from openai_harmony import Message from openai_harmony import Message
from ....utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from .conftest import ( from .conftest import (
BASE_TEST_ENV, BASE_TEST_ENV,
events_contain_type, events_contain_type,
......
...@@ -9,9 +9,9 @@ import pytest_asyncio ...@@ -9,9 +9,9 @@ import pytest_asyncio
from openai import OpenAI from openai import OpenAI
from openai_harmony import ToolDescription, ToolNamespaceConfig from openai_harmony import ToolDescription, ToolNamespaceConfig
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.mcp.tool_server import MCPToolServer from vllm.entrypoints.mcp.tool_server import MCPToolServer
from ....utils import RemoteOpenAIServer
from .conftest import ( from .conftest import (
BASE_TEST_ENV, BASE_TEST_ENV,
events_contain_type, events_contain_type,
...@@ -42,7 +42,7 @@ class TestMCPToolServerUnit: ...@@ -42,7 +42,7 @@ class TestMCPToolServerUnit:
Note: The wildcard "*" is normalized to None by Note: The wildcard "*" is normalized to None by
_extract_allowed_tools_from_mcp_requests before reaching this layer, _extract_allowed_tools_from_mcp_requests before reaching this layer,
so we only test None and specific tool filtering here. so we only test None and specific tool filtering here.
See test_serving_responses.py for "*" normalization tests. See responses/test_serving_responses.py for "*" normalization tests.
""" """
def test_get_tool_description(self): def test_get_tool_description(self):
......
...@@ -9,7 +9,8 @@ import pytest ...@@ -9,7 +9,8 @@ import pytest
import pytest_asyncio import pytest_asyncio
from openai import OpenAI from openai import OpenAI
from ....utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from .conftest import ( from .conftest import (
BASE_TEST_ENV, BASE_TEST_ENV,
has_output_type, has_output_type,
......
...@@ -159,6 +159,7 @@ class TestInitializeToolSessions: ...@@ -159,6 +159,7 @@ class TestInitializeToolSessions:
instance = OpenAIServingResponses( instance = OpenAIServingResponses(
engine_client=engine_client, engine_client=engine_client,
models=models, models=models,
openai_serving_render=MagicMock(),
request_logger=None, request_logger=None,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
...@@ -245,6 +246,7 @@ class TestValidateGeneratorInput: ...@@ -245,6 +246,7 @@ class TestValidateGeneratorInput:
instance = OpenAIServingResponses( instance = OpenAIServingResponses(
engine_client=engine_client, engine_client=engine_client,
models=models, models=models,
openai_serving_render=MagicMock(),
request_logger=None, request_logger=None,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
...@@ -308,6 +310,7 @@ async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch): ...@@ -308,6 +310,7 @@ async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch):
serving = OpenAIServingResponses( serving = OpenAIServingResponses(
engine_client=engine_client, engine_client=engine_client,
models=models, models=models,
openai_serving_render=MagicMock(),
request_logger=None, request_logger=None,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
...@@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning(): ...@@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning():
serving = OpenAIServingResponses( serving = OpenAIServingResponses(
engine_client=engine_client, engine_client=engine_client,
models=models, models=models,
openai_serving_render=MagicMock(),
request_logger=None, request_logger=None,
chat_template=None, chat_template=None,
chat_template_content_format="auto", chat_template_content_format="auto",
......
...@@ -5,7 +5,8 @@ import pytest ...@@ -5,7 +5,8 @@ import pytest
import pytest_asyncio import pytest_asyncio
from openai import OpenAI from openai import OpenAI
from ....utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
from .conftest import validate_streaming_event_stack from .conftest import validate_streaming_event_stack
MODEL_NAME = "Qwen/Qwen3-8B" MODEL_NAME = "Qwen/Qwen3-8B"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment