Commit a810671a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

parents 86b5aefe 6a09612b
...@@ -9,7 +9,7 @@ from typing import Annotated, Literal ...@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import pytest import pytest
from vllm.config import CompilationConfig, config from vllm.config import AttentionConfig, CompilationConfig, config
from vllm.engine.arg_utils import ( from vllm.engine.arg_utils import (
EngineArgs, EngineArgs,
contains_type, contains_type,
...@@ -298,6 +298,139 @@ def test_compilation_config(): ...@@ -298,6 +298,139 @@ def test_compilation_config():
) )
def test_attention_config():
from vllm.attention.backends.registry import AttentionBackendEnum
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
# default value
args = parser.parse_args([])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config == AttentionConfig()
# set backend via dot notation
args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASH_ATTN"
# set backend via --attention-backend shorthand
args = parser.parse_args(["--attention-backend", "FLASHINFER"])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_backend is not None
assert engine_args.attention_backend == "FLASHINFER"
# set all fields via dot notation
args = parser.parse_args(
[
"--attention-config.backend",
"FLASH_ATTN",
"--attention-config.flash_attn_version",
"3",
"--attention-config.use_prefill_decode_attention",
"true",
"--attention-config.flash_attn_max_num_splits_for_cuda_graph",
"16",
"--attention-config.use_cudnn_prefill",
"true",
"--attention-config.use_trtllm_ragged_deepseek_prefill",
"true",
"--attention-config.use_trtllm_attention",
"true",
"--attention-config.disable_flashinfer_prefill",
"true",
"--attention-config.disable_flashinfer_q_quantization",
"true",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASH_ATTN"
assert engine_args.attention_config.flash_attn_version == 3
assert engine_args.attention_config.use_prefill_decode_attention is True
assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16
assert engine_args.attention_config.use_cudnn_prefill is True
assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True
assert engine_args.attention_config.use_trtllm_attention is True
assert engine_args.attention_config.disable_flashinfer_prefill is True
assert engine_args.attention_config.disable_flashinfer_q_quantization is True
# set to string form of a dict with all fields
args = parser.parse_args(
[
"--attention-config="
'{"backend": "FLASHINFER", "flash_attn_version": 2, '
'"use_prefill_decode_attention": false, '
'"flash_attn_max_num_splits_for_cuda_graph": 8, '
'"use_cudnn_prefill": false, '
'"use_trtllm_ragged_deepseek_prefill": false, '
'"use_trtllm_attention": false, '
'"disable_flashinfer_prefill": false, '
'"disable_flashinfer_q_quantization": false}',
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASHINFER"
assert engine_args.attention_config.flash_attn_version == 2
assert engine_args.attention_config.use_prefill_decode_attention is False
assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8
assert engine_args.attention_config.use_cudnn_prefill is False
assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False
assert engine_args.attention_config.use_trtllm_attention is False
assert engine_args.attention_config.disable_flashinfer_prefill is False
assert engine_args.attention_config.disable_flashinfer_q_quantization is False
# test --attention-backend flows into VllmConfig.attention_config
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-backend",
"FLASH_ATTN",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
vllm_config = engine_args.create_engine_config()
assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN
# test --attention-config.backend flows into VllmConfig.attention_config
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-config.backend",
"FLASHINFER",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
vllm_config = engine_args.create_engine_config()
assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER
# test --attention-backend and --attention-config.backend are mutually exclusive
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-backend",
"FLASH_ATTN",
"--attention-config.backend",
"FLASHINFER",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
with pytest.raises(ValueError, match="mutually exclusive"):
engine_args.create_engine_config()
def test_prefix_cache_default(): def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([]) args = parser.parse_args([])
......
...@@ -14,11 +14,10 @@ import requests ...@@ -14,11 +14,10 @@ import requests
from prometheus_client.parser import text_string_to_metric_families from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer from transformers import AutoTokenizer
from tests.conftest import LocalAssetServer
from tests.utils import RemoteOpenAIServer
from vllm import version from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
MODELS = { MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct", "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
......
...@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request, lora_request,
trace_headers, trace_headers,
priority, priority,
data_parallel_rank,
): ):
return dict(engine_prompt), {} return dict(engine_prompt), {}
......
...@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: ...@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
lora_request, lora_request,
trace_headers, trace_headers,
priority, priority,
data_parallel_rank,
): ):
return dict(engine_prompt), {} return dict(engine_prompt), {}
......
...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model=model_name, model=model_name,
input="What is 13 * 24? Use python to calculate the result.", input="What is 13 * 24? Use python to calculate the result.",
tools=[{"type": "code_interpreter", "container": {"type": "auto"}}], tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
extra_body={"enable_response_messages": True},
temperature=0.0, temperature=0.0,
) )
...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): ...@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output # make sure the correct math is in the final output
assert response.output[3].type == "message" assert response.output[3].type == "message"
assert "312" in response.output[3].content[0].text assert "312" in response.output[3].content[0].text
# test raw input_messages / output_messages
assert len(response.input_messages) == 1
assert len(response.output_messages) == 3
assert "312" in response.output_messages[2]["message"]
...@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str): ...@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
assert response.output[0].type == "reasoning" assert response.output[0].type == "reasoning"
assert response.output[1].type == "message" assert response.output[1].type == "message"
assert type(response.output[1].content[0].text) is str assert type(response.output[1].content[0].text) is str
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_output_consistency(client: OpenAI, model_name: str):
"""Test that streaming delta text matches the final response output_text.
This test verifies that when using streaming mode:
1. The concatenated text from all 'response.output_text.delta' events
2. Matches the 'output_text' in the final 'response.completed' event
"""
response = await client.responses.create(
model=model_name,
input="Say hello in one sentence.",
stream=True,
)
events = []
async for event in response:
events.append(event)
assert len(events) > 0
# Concatenate all delta text from streaming events
streaming_text = "".join(
event.delta for event in events if event.type == "response.output_text.delta"
)
# Get the final response from the last event
response_completed_event = events[-1]
assert response_completed_event.type == "response.completed"
assert response_completed_event.response.status == "completed"
# Get output_text from the final response
final_output_text = response_completed_event.response.output_text
# Verify final response has output
assert len(response_completed_event.response.output) > 0
# Verify streaming text matches final output_text
assert streaming_text == final_output_text, (
f"Streaming text does not match final output_text.\n"
f"Streaming: {streaming_text!r}\n"
f"Final: {final_output_text!r}"
)
...@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client): ...@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
) )
assert transcription.segments is not None assert transcription.segments is not None
assert len(transcription.segments) > 0 assert len(transcription.segments) > 0
@pytest.mark.asyncio
async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": 1},
)
out = json.loads(transcription)
out_text = out["text"]
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": int(1e6)},
)
out = json.loads(transcription)
out_text = out["text"]
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) < 450 # ~Whisper max output len
...@@ -37,7 +37,7 @@ def server(): ...@@ -37,7 +37,7 @@ def server():
"--max-num-seqs", "--max-num-seqs",
"128", "128",
"--worker-extension-cls", "--worker-extension-cls",
"tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension", "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
] ]
with RemoteOpenAIServer( with RemoteOpenAIServer(
MODEL_NAME, MODEL_NAME,
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import requests import requests
from prometheus_client.parser import text_string_to_metric_families from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B" MODEL_NAME = "meta-llama/Llama-3.2-1B"
......
This diff is collapsed.
...@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8" ...@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
accuracy_threshold: 0.72 accuracy_threshold: 0.72
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
...@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" ...@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold: 0.45 accuracy_threshold: 0.45
num_questions: 1319 num_questions: 1319
num_fewshot: 5 num_fewshot: 5
max_model_len: 4096 server_args: "--enforce-eager --max-model-len 4096"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment