Commit a810671a authored by zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
...@@ -9,7 +9,7 @@ from typing import Annotated, Literal ...@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import pytest import pytest
from vllm.config import CompilationConfig, config from vllm.config import AttentionConfig, CompilationConfig, config
from vllm.engine.arg_utils import ( from vllm.engine.arg_utils import (
EngineArgs, EngineArgs,
contains_type, contains_type,
...@@ -298,6 +298,139 @@ def test_compilation_config(): ...@@ -298,6 +298,139 @@ def test_compilation_config():
) )
def test_attention_config():
    """Exercise every CLI route for configuring attention.

    Covers the default config, dotted ``--attention-config.<field>`` flags,
    the ``--attention-backend`` shorthand, a JSON-encoded
    ``--attention-config`` value, propagation of the backend into the engine
    config, and the conflict raised when the shorthand and the dotted backend
    flag are both given.
    """
    from vllm.attention.backends.registry import AttentionBackendEnum

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    def build_engine_args(argv):
        # Parse one command line and lift the namespace into EngineArgs.
        namespace = parser.parse_args(argv)
        assert namespace is not None
        return EngineArgs.from_cli_args(namespace)

    boolean_fields = (
        "use_prefill_decode_attention",
        "use_cudnn_prefill",
        "use_trtllm_ragged_deepseek_prefill",
        "use_trtllm_attention",
        "disable_flashinfer_prefill",
        "disable_flashinfer_q_quantization",
    )
    model_argv = ["--model", "facebook/opt-125m"]

    # default value
    assert build_engine_args([]).attention_config == AttentionConfig()

    # set backend via dot notation
    attn = build_engine_args(
        ["--attention-config.backend", "FLASH_ATTN"]
    ).attention_config
    assert attn.backend is not None
    assert attn.backend.name == "FLASH_ATTN"

    # set backend via --attention-backend shorthand
    shorthand_args = build_engine_args(["--attention-backend", "FLASHINFER"])
    assert shorthand_args.attention_backend is not None
    assert shorthand_args.attention_backend == "FLASHINFER"

    # set all fields via dot notation
    dotted_pairs = [
        ("--attention-config.backend", "FLASH_ATTN"),
        ("--attention-config.flash_attn_version", "3"),
        ("--attention-config.use_prefill_decode_attention", "true"),
        ("--attention-config.flash_attn_max_num_splits_for_cuda_graph", "16"),
        ("--attention-config.use_cudnn_prefill", "true"),
        ("--attention-config.use_trtllm_ragged_deepseek_prefill", "true"),
        ("--attention-config.use_trtllm_attention", "true"),
        ("--attention-config.disable_flashinfer_prefill", "true"),
        ("--attention-config.disable_flashinfer_q_quantization", "true"),
    ]
    # Flatten (flag, value) pairs into the flat argv list argparse expects.
    dotted_argv = [token for pair in dotted_pairs for token in pair]
    attn = build_engine_args(dotted_argv).attention_config
    assert attn.backend is not None
    assert attn.backend.name == "FLASH_ATTN"
    assert attn.flash_attn_version == 3
    assert attn.flash_attn_max_num_splits_for_cuda_graph == 16
    for field_name in boolean_fields:
        assert getattr(attn, field_name) is True

    # set to string form of a dict with all fields
    attn = build_engine_args(
        [
            "--attention-config="
            '{"backend": "FLASHINFER", "flash_attn_version": 2, '
            '"use_prefill_decode_attention": false, '
            '"flash_attn_max_num_splits_for_cuda_graph": 8, '
            '"use_cudnn_prefill": false, '
            '"use_trtllm_ragged_deepseek_prefill": false, '
            '"use_trtllm_attention": false, '
            '"disable_flashinfer_prefill": false, '
            '"disable_flashinfer_q_quantization": false}',
        ]
    ).attention_config
    assert attn.backend is not None
    assert attn.backend.name == "FLASHINFER"
    assert attn.flash_attn_version == 2
    assert attn.flash_attn_max_num_splits_for_cuda_graph == 8
    for field_name in boolean_fields:
        assert getattr(attn, field_name) is False

    # test --attention-backend flows into VllmConfig.attention_config
    vllm_config = build_engine_args(
        model_argv + ["--attention-backend", "FLASH_ATTN"]
    ).create_engine_config()
    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN

    # test --attention-config.backend flows into VllmConfig.attention_config
    vllm_config = build_engine_args(
        model_argv + ["--attention-config.backend", "FLASHINFER"]
    ).create_engine_config()
    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER

    # test --attention-backend and --attention-config.backend are mutually exclusive
    conflicting_args = build_engine_args(
        model_argv
        + [
            "--attention-backend",
            "FLASH_ATTN",
            "--attention-config.backend",
            "FLASHINFER",
        ]
    )
    with pytest.raises(ValueError, match="mutually exclusive"):
        conflicting_args.create_engine_config()
def test_prefix_cache_default(): def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([]) args = parser.parse_args([])
......
...@@ -14,11 +14,10 @@ import requests ...@@ -14,11 +14,10 @@ import requests
from prometheus_client.parser import text_string_to_metric_families from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.conftest import LocalAssetServer
from tests.utils import RemoteOpenAIServer
from vllm import version from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
MODELS = { MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct", "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
......
...@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request, lora_request,
trace_headers, trace_headers,
priority, priority,
data_parallel_rank,
): ):
return dict(engine_prompt), {} return dict(engine_prompt), {}
......
...@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: ...@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
lora_request, lora_request,
trace_headers, trace_headers,
priority, priority,
data_parallel_rank,
): ):
return dict(engine_prompt), {} return dict(engine_prompt), {}
......
...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model=model_name, model=model_name,
input="What is 13 * 24? Use python to calculate the result.", input="What is 13 * 24? Use python to calculate the result.",
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}], tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
extra_body={"enable_response_messages": True},
temperature=0.0, temperature=0.0,
) )
...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output # make sure the correct math is in the final output
assert response.output[3].type == "message" assert response.output[3].type == "message"
assert "312" in response.output[3].content[0].text assert "312" in response.output[3].content[0].text
# test raw input_messages / output_messages
assert len(response.input_messages) == 1
assert len(response.output_messages) == 3
assert "312" in response.output_messages[2]["message"]
...@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str): ...@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
assert response.output[0].type == "reasoning" assert response.output[0].type == "reasoning"
assert response.output[1].type == "message" assert response.output[1].type == "message"
assert type(response.output[1].content[0].text) is str assert type(response.output[1].content[0].text) is str
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_output_consistency(client: OpenAI, model_name: str):
    """Check that streamed deltas add up to the final ``output_text``.

    Joining the ``delta`` of every 'response.output_text.delta' event must
    reproduce the ``output_text`` carried by the closing
    'response.completed' event.
    """
    stream = await client.responses.create(
        model=model_name,
        input="Say hello in one sentence.",
        stream=True,
    )

    # Drain the stream so the events can be inspected more than once.
    events = [event async for event in stream]
    assert len(events) > 0

    # Concatenate all delta text from streaming events
    delta_chunks = []
    for event in events:
        if event.type == "response.output_text.delta":
            delta_chunks.append(event.delta)
    streaming_text = "".join(delta_chunks)

    # The last event is expected to carry the completed response.
    last_event = events[-1]
    assert last_event.type == "response.completed"
    assert last_event.response.status == "completed"

    completed_response = last_event.response
    final_output_text = completed_response.output_text

    # Verify final response has output
    assert len(completed_response.output) > 0

    # Verify streaming text matches final output_text
    assert streaming_text == final_output_text, (
        f"Streaming text does not match final output_text.\n"
        f"Streaming: {streaming_text!r}\n"
        f"Final: {final_output_text!r}"
    )
...@@ -52,8 +52,19 @@ def with_tool_parser(request) -> bool: ...@@ -52,8 +52,19 @@ def with_tool_parser(request) -> bool:
return request.param return request.param
@pytest.fixture(
    scope="module",
    params=[True],
    ids=["exclude_tools_when_tool_choice_none"],
)
def exclude_tools_when_tool_choice_none(request) -> bool:
    """Module-scoped toggle, currently parametrized with ``True`` only.

    Returns the active parameter value unchanged so dependent fixtures and
    tests can branch on it.
    """
    enabled = request.param
    return enabled
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args(with_tool_parser: bool): def default_server_args(
with_tool_parser: bool, exclude_tools_when_tool_choice_none: bool
):
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--enforce-eager", "--enforce-eager",
...@@ -72,19 +83,16 @@ def default_server_args(with_tool_parser: bool): ...@@ -72,19 +83,16 @@ def default_server_args(with_tool_parser: bool):
"--enable-auto-tool-choice", "--enable-auto-tool-choice",
] ]
) )
if exclude_tools_when_tool_choice_none:
args.append("--exclude-tools-when-tool-choice-none")
return args return args
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def gptoss_server( def gptoss_server(default_server_args: list[str]):
monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str] server_args = default_server_args + ["--attention-backend=TRITON_ATTN"]
): with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
with monkeypatch_module.context() as m: yield remote_server
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
with RemoteOpenAIServer(
GPT_OSS_MODEL_NAME, default_server_args
) as remote_server:
yield remote_server
@pytest_asyncio.fixture @pytest_asyncio.fixture
...@@ -340,6 +348,69 @@ async def test_gpt_oss_tool_message_array_content( ...@@ -340,6 +348,69 @@ async def test_gpt_oss_tool_message_array_content(
assert response_multi_array.choices[0].message is not None assert response_multi_array.choices[0].message is not None
@pytest.mark.asyncio
async def test_gpt_oss_tool_choice_none(
    gptoss_client: OpenAI,
    with_tool_parser: bool,
    exclude_tools_when_tool_choice_none: bool,
):
    """Compare ``tool_choice="auto"`` with ``tool_choice="none"``.

    With the same prompt and tool list, the "auto" request must produce
    exactly one tool call and the "none" request must produce no tool calls.
    The test is skipped unless both the tool parser and the
    exclude-tools-when-tool-choice-none option are enabled.
    """
    if not (with_tool_parser and exclude_tools_when_tool_choice_none):
        pytest.skip(
            "skip tool_choice tests when non-tool or "
            "--exclude-tools-when-tool-choice-none not set"
        )

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string"},
                        "state": {"type": "string"},
                        "unit": {
                            "type": "string",
                            "enum": ["celsius", "fahrenheit"],
                        },
                    },
                    "required": ["city", "state", "unit"],
                },
            },
        }
    ]

    messages = [
        {
            "role": "user",
            "content": "What's the temperature(in degrees Celsius) in Dallas?",
        },
    ]

    tool_choice_auto = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        tools=tools,
        tool_choice="auto",
        temperature=0.0,
    )
    msg = tool_choice_auto.choices[0].message
    # Guard first so a missing field fails as an assertion, not a TypeError.
    assert msg.tool_calls is not None
    assert len(msg.tool_calls) == 1

    tool_choice_none = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        tools=tools,
        tool_choice="none",
        temperature=0.0,
    )
    msg = tool_choice_none.choices[0].message
    # "No tool calls" may be reported as either an empty list or None
    # (the field is optional in the OpenAI schema), so accept both.
    assert not msg.tool_calls
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2" MODEL_NAME_SHORT = "gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
...@@ -401,6 +472,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -401,6 +472,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request, lora_request,
trace_headers, trace_headers,
priority, priority,
data_parallel_rank,
): ):
return dict(engine_prompt), {} return dict(engine_prompt), {}
......
...@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client): ...@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
) )
assert transcription.segments is not None assert transcription.segments is not None
assert len(transcription.segments) > 0 assert len(transcription.segments) > 0
@pytest.mark.asyncio
async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
    """Check that ``max_completion_tokens`` bounds the transcription length.

    A limit of 1 must yield exactly one token. A limit far larger than the
    model can produce must still yield fewer than 450 tokens.
    """

    async def transcribe_text(token_limit):
        # Issue one transcription request and return the decoded "text" field.
        raw = await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
            extra_body={"max_completion_tokens": token_limit},
        )
        return json.loads(raw)["text"]

    single_token_text = await transcribe_text(1)

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    def count_tokens(text):
        # Count tokens without special tokens so only generated text counts.
        return len(tok(text, add_special_tokens=False)["input_ids"])

    assert count_tokens(single_token_text) == 1

    # max_completion_tokens > max_model_len
    uncapped_text = await transcribe_text(int(1e6))
    assert count_tokens(uncapped_text) < 450  # ~Whisper max output len
...@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model): ...@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model):
) )
out = json.loads(translation)["text"].strip().lower() out = json.loads(translation)["text"].strip().lower()
assert out.count("greek sea") == 2 assert out.count("greek sea") == 2
@pytest.mark.asyncio
async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
    """Check that ``max_completion_tokens`` bounds the translation length.

    A limit of 1 must yield exactly one token. A limit far larger than the
    model can produce must still yield fewer than 450 tokens. Both requests
    go to the translations endpoint, which is the endpoint under test here.
    """
    client, model_name = client_and_model
    translation = await client.audio.translations.create(
        model=model_name,
        file=mary_had_lamb,
        response_format="text",
        temperature=0.0,
        extra_body={"max_completion_tokens": 1},
    )
    out = json.loads(translation)
    out_text = out["text"]
    print(out_text)
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(model_name)
    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
    assert len(out_tokens) == 1

    # max_completion_tokens > max_model_len
    # Use the translations endpoint again: the original called
    # transcriptions here, leaving the translation clamp unexercised.
    translation = await client.audio.translations.create(
        model=model_name,
        file=mary_had_lamb,
        response_format="text",
        temperature=0.0,
        extra_body={"max_completion_tokens": int(1e6)},
    )
    out = json.loads(translation)
    out_text = out["text"]
    print(out_text)
    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
    assert len(out_tokens) < 450  # ~Whisper max output len
...@@ -37,7 +37,7 @@ def server(): ...@@ -37,7 +37,7 @@ def server():
"--max-num-seqs", "--max-num-seqs",
"128", "128",
"--worker-extension-cls", "--worker-extension-cls",
"tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension", "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
] ]
with RemoteOpenAIServer( with RemoteOpenAIServer(
MODEL_NAME, MODEL_NAME,
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import requests import requests
from prometheus_client.parser import text_string_to_metric_families from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B" MODEL_NAME = "meta-llama/Llama-3.2-1B"
......
...@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation, ...@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
### Run tests with pytest (like buildkite) ### Run tests with pytest (like buildkite)
```bash ```bash
pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \ pytest -s -v tests/evals/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \ --config-list-file=configs/models-small.txt
--tp-size=1
``` ```
### Run standalone evaluation script ### Run standalone evaluation script
...@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct" ...@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold: 0.54 # Minimum expected accuracy accuracy_threshold: 0.54 # Minimum expected accuracy
num_questions: 1319 # Number of questions (default: full test set) num_questions: 1319 # Number of questions (default: full test set)
num_fewshot: 5 # Few-shot examples from train set num_fewshot: 5 # Few-shot examples from train set
max_model_len: 4096 # Model context length server_args: "--max-model-len 4096 --tensor-parallel-size 2" # Server arguments
env: # Environment variables (optional)
VLLM_USE_FLASHINFER_MOE_FP4: "1"
``` ```
The `server_args` field accepts any arguments that can be passed to `vllm serve`.
The `env` field accepts a dictionary of environment variables to set for the server process.
...@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" ...@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
accuracy_threshold: 0.72 accuracy_threshold: 0.72
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
...@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" ...@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold: 0.74 accuracy_threshold: 0.74
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
...@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8" ...@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold: 0.31 accuracy_threshold: 0.31
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
...@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" ...@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold: 0.45 accuracy_threshold: 0.45
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
...@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" ...@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold: 0.60 accuracy_threshold: 0.60
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment