Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -9,6 +9,7 @@ import pytest
import pytest_asyncio
import torch.cuda
from tests.utils import RemoteOpenAIServer
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig,
......@@ -17,8 +18,6 @@ from vllm.model_executor.model_loader.tensorizer import (
)
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
LORA_PATH = "davzoku/finqa_adapter_1b"
......
......@@ -6,11 +6,10 @@ import tempfile
import pytest
from tests.utils import RemoteOpenAIServer
from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
......
......@@ -19,8 +19,10 @@ import soundfile
import torch
from datasets import load_dataset
from evaluate import load
from transformers import AutoTokenizer
from vllm.tokenizers import get_tokenizer
from ....models.registry import HF_EXAMPLE_MODELS
from ....utils import RemoteOpenAIServer
......@@ -64,8 +66,12 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference):
async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)
# Load tokenizer once outside the loop
tokenizer = AutoTokenizer.from_pretrained(model)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
tokenizer = get_tokenizer(
model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
)
# Warmup call as the first `librosa.load` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
......@@ -144,20 +150,35 @@ def run_evaluation(
# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize(
"model_config",
[
("openai/whisper-large-v3", 12.744980),
# TODO (ekagra): add HF ckpt after asr release
# ("/host/engines/vllm/audio/2b-release", 11.73),
],
)
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
@pytest.mark.parametrize(
"dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]
)
# NOTE: Expected WER measured with equivalent hf.transformers args:
# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
@pytest.mark.parametrize("expected_wer", [12.744980])
def test_wer_correctness(
model_name, dataset_repo, expected_wer, n_examples=-1, max_concurrent_request=None
model_config, dataset_repo, n_examples=-1, max_concurrent_request=None
):
model_name, expected_wer = model_config
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_name)
# TODO refactor to use `ASRDataset`
server_args = [
"--enforce-eager",
f"--tokenizer_mode={model_info.tokenizer_mode}",
]
if model_info.trust_remote_code:
server_args.append("--trust-remote-code")
with RemoteOpenAIServer(
model_name, ["--enforce-eager"], max_wait_seconds=480
model_name,
server_args,
) as remote_server:
dataset = load_hf_dataset(dataset_repo)
......@@ -167,7 +188,14 @@ def test_wer_correctness(
client = remote_server.get_async_client()
wer = run_evaluation(
model_name, client, dataset, max_concurrent_request, n_examples
model_name,
client,
dataset,
max_concurrent_request,
n_examples,
)
print(f"Expected WER: {expected_wer}, Actual WER: {wer}")
if expected_wer:
torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
......@@ -5,7 +5,7 @@ import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
......
......@@ -2,20 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import base64
import json
import warnings
import librosa
import numpy as np
import pybase64 as base64
import pytest
import websockets
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from vllm.assets.audio import AudioAsset
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
......
......@@ -8,6 +8,9 @@ from collections.abc import Callable
from typing import Any
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
logger = logging.getLogger(__name__)
......@@ -361,3 +364,38 @@ def log_response_diagnostics(
)
return diagnostics
@pytest.fixture(scope="module")
def default_server_args():
    """Build the command-line flags passed to the module's test server.

    Returns:
        A new list of flag/value strings, in the order they are passed on.
    """
    args = ["--max-model-len", "18192"]
    args.append("--enforce-eager")  # For faster startup.
    args.append("--enable-auto-tool-choice")
    args += ["--structured-outputs-config.backend", "xgrammar"]
    args += ["--tool-call-parser", "hermes"]
    args += ["--reasoning-parser", "qwen3"]
    return args
@pytest.fixture(scope="module")
def server_with_store(default_server_args):
    """Launch a `Qwen/Qwen3-1.7B` server once per module and yield it.

    The server is started with `default_server_args` and with the
    `VLLM_ENABLE_RESPONSES_API_STORE` and `VLLM_SERVER_DEV_MODE`
    environment variables both set to "1". The `with` block is left
    when the fixture is torn down.
    """
    server_env = {
        "VLLM_ENABLE_RESPONSES_API_STORE": "1",
        "VLLM_SERVER_DEV_MODE": "1",
    }
    server_cm = RemoteOpenAIServer(
        "Qwen/Qwen3-1.7B",
        default_server_args,
        env_dict=server_env,
    )
    with server_cm as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server_with_store):
    """Yield an async client obtained from the `server_with_store` server."""
    client_cm = server_with_store.get_async_client()
    async with client_cm as api_client:
        yield api_client
......@@ -118,7 +118,6 @@ async def test_function_tool_use(
tool_choice=tool_choice,
temperature=0.0,
)
assert len(response.output) >= 1
tool_call = None
reasoning = None
......@@ -127,11 +126,43 @@ async def test_function_tool_use(
tool_call = out
if out.type == "reasoning":
reasoning = out
assert tool_call is not None
assert tool_call.type == "function_call"
assert json.loads(tool_call.arguments) is not None
assert reasoning is not None
assert reasoning.type == "reasoning"
if response.incomplete_details is None:
assert tool_call is not None
assert tool_call.type == "function_call"
assert json.loads(tool_call.arguments) is not None
assert reasoning is not None
assert reasoning.type == "reasoning"
else:
print(response.model_dump_json(indent=2))
assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens_with_tool_choice_required(
    client: openai.AsyncOpenAI, model_name: str
):
    """Check that `tool_choice="required"` with a 10-token budget yields no tool call.

    The response must contain at least one output item, none of them a
    `function_call`, and must report `max_output_tokens` as the reason it
    is incomplete.
    """
    weather_question = (
        "Can you tell me what the current weather is in Berlin and the "
        "forecast for the next 5 days, in fahrenheit?"
    )
    response = await client.responses.create(
        model=model_name,
        input=[{"role": "user", "content": weather_question}],
        tools=tools,
        tool_choice="required",
        max_output_tokens=10,
    )

    outputs = response.output
    assert len(outputs) >= 1
    # With `tool_choice="required"`, a tool call that does not fit within
    # `max_output_tokens` must not be emitted as a `function_call` item.
    # This is intended to be consistent with OpenAI's behavior.
    for item in outputs:
        assert item.type != "function_call"
    assert response.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
......
......@@ -16,7 +16,8 @@ import requests
from openai import InternalServerError, NotFoundError, OpenAI
from openai_harmony import Message
from ....utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import (
BASE_TEST_ENV,
events_contain_type,
......
......@@ -9,9 +9,9 @@ import pytest_asyncio
from openai import OpenAI
from openai_harmony import ToolDescription, ToolNamespaceConfig
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.mcp.tool_server import MCPToolServer
from ....utils import RemoteOpenAIServer
from .conftest import (
BASE_TEST_ENV,
events_contain_type,
......@@ -42,7 +42,7 @@ class TestMCPToolServerUnit:
Note: The wildcard "*" is normalized to None by
_extract_allowed_tools_from_mcp_requests before reaching this layer,
so we only test None and specific tool filtering here.
See test_serving_responses.py for "*" normalization tests.
See responses/test_serving_responses.py for "*" normalization tests.
"""
def test_get_tool_description(self):
......
......@@ -9,7 +9,8 @@ import pytest
import pytest_asyncio
from openai import OpenAI
from ....utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import (
BASE_TEST_ENV,
has_output_type,
......
......@@ -159,6 +159,7 @@ class TestInitializeToolSessions:
instance = OpenAIServingResponses(
engine_client=engine_client,
models=models,
openai_serving_render=MagicMock(),
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
......@@ -245,6 +246,7 @@ class TestValidateGeneratorInput:
instance = OpenAIServingResponses(
engine_client=engine_client,
models=models,
openai_serving_render=MagicMock(),
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
......@@ -308,6 +310,7 @@ async def test_reasoning_tokens_counted_for_text_reasoning_model(monkeypatch):
serving = OpenAIServingResponses(
engine_client=engine_client,
models=models,
openai_serving_render=MagicMock(),
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
......@@ -607,6 +610,7 @@ def _make_serving_instance_with_reasoning():
serving = OpenAIServingResponses(
engine_client=engine_client,
models=models,
openai_serving_render=MagicMock(),
request_logger=None,
chat_template=None,
chat_template_content_format="auto",
......
......@@ -5,7 +5,8 @@ import pytest
import pytest_asyncio
from openai import OpenAI
from ....utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
from .conftest import validate_streaming_event_stack
MODEL_NAME = "Qwen/Qwen3-8B"
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment