Commit d2b52805 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

parents 9a521c23 5438967f
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
...@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer, ...@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert metric in response.text assert metric in response.text
@pytest.mark.asyncio
async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                   client: openai.AsyncClient, use_v1: bool):
    """Check that the request/KV-cache metrics return to zero after aborts.

    The test reads the metrics before any request, starts three completion
    requests as asyncio tasks, confirms that the metrics report activity,
    cancels the tasks, and finally expects the running count, waiting count
    and KV cache usage to all read zero again.
    """
    # Baseline: nothing running, nothing waiting, no KV cache in use.
    running, waiting, kv_usage = _get_running_metrics_from_api(server)
    assert running == 0
    assert waiting == 0
    assert kv_usage == 0.0

    # Launch a few long generations so there is something to abort.
    pending = [
        asyncio.create_task(
            client.completions.create(
                model=MODEL_NAME,
                prompt=_TOKENIZED_PROMPT,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0,
            )) for _ in range(3)
    ]

    # Give the requests a moment to start processing.
    await asyncio.sleep(0.5)

    # While the requests are in flight the metrics must show activity.
    running, waiting, kv_usage = _get_running_metrics_from_api(server)
    assert running > 0
    assert kv_usage > 0

    # Abort everything by cancelling the client-side tasks.
    for job in pending:
        job.cancel()

    # Leave time for the cancellations to be processed.
    await asyncio.sleep(1.0)

    # The metrics endpoint itself must still be reachable.
    metrics_response = requests.get(server.url_for("metrics"))
    assert metrics_response.status_code == HTTPStatus.OK

    # All three values are expected to be back at zero.
    running_after, waiting_after, kv_usage_after = (
        _get_running_metrics_from_api(server))
    assert running_after == 0, (
        f"Expected 0 running requests after abort, got "
        f"{running_after}")
    assert waiting_after == 0, (
        f"Expected 0 waiting requests after abort, got "
        f"{waiting_after}")
    assert kv_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got "
        f"{kv_usage_after}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)

    Fetches the server's ``metrics`` URL, parses the body with
    ``text_string_to_metric_families`` and, for each of the three metric
    families of interest, takes the value of the first sample whose name
    equals the family name.

    Raises:
        AssertionError: if the endpoint does not answer with HTTP 200 or if
            any of the three metrics has no matching sample in the response.
    """
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    # Listed in the order in which the values are returned.
    metric_names = (
        "vllm:num_requests_running",
        "vllm:num_requests_waiting",
        "vllm:gpu_cache_usage_perc",
    )
    values = dict.fromkeys(metric_names)

    for family in text_string_to_metric_families(response.text):
        if family.name not in values:
            continue
        for sample in family.samples:
            # Only the sample named exactly like its family is considered;
            # the first such sample is used.
            if sample.name == family.name:
                values[family.name] = sample.value
                break

    # Name the missing metric so a failure is actionable.
    for name in metric_names:
        assert values[name] is not None, (
            f"Metric {name!r} not found in the metrics response")

    running_requests, waiting_requests, kv_cache_usage = (
        values[name] for name in metric_names)
    return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch(use_v1: bool): def test_metrics_exist_run_batch(use_v1: bool):
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501 input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
......
...@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy): ...@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \ -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
http://localhost:8000/v1/chat/completions http://localhost:8000/v1/chat/completions
""" # noqa: E501 """ # noqa: E501
if (hasattr(case, "body") and isinstance(case.body, dict) if hasattr(case, "body") and isinstance(case.body, dict):
and "messages" in case.body if ("messages" in case.body
and isinstance(case.body["messages"], list) and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0): and len(case.body["messages"]) > 0):
for message in case.body["messages"]: for message in case.body["messages"]:
if not isinstance(message, dict): if not isinstance(message, dict):
continue continue
# Check for invalid file type in tokenize endpoint # Check for invalid file type in tokenize endpoint
if op.method.lower() == "post" and op.path == "/tokenize": if op.method.lower() == "post" and op.path == "/tokenize":
content = message.get("content", []) content = message.get("content", [])
if (isinstance(content, list) and len(content) > 0 and any( if (isinstance(content, list) and len(content) > 0
item.get("type") == "file" for item in content)): and any(
return False item.get("type") == "file"
for item in content)):
# Check for invalid tool_calls with non-function types return False
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list): # Check for invalid tool_calls with non-function types
for tool_call in tool_calls: tool_calls = message.get("tool_calls", [])
if isinstance(tool_call, dict): if isinstance(tool_calls, list):
if tool_call.get("type") != "function": for tool_call in tool_calls:
return False if isinstance(tool_call, dict):
if "custom" in tool_call: if tool_call.get("type") != "function":
return False return False
if "custom" in tool_call:
return False
# Sometimes guided_grammar is generated to be empty
# Causing a server error in EBNF grammar parsing
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
guided_grammar = case.body.get("guided_grammar")
if guided_grammar == '':
# Allow None (will be handled as no grammar)
# But skip empty strings
return False
return True return True
return strategy.filter(no_invalid_types) return strategy.filter(no_invalid_types)
......
...@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base" ...@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16" DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that requests ``run_with_both_engines`` for each test.

    It only exists to depend on that fixture so every test in this module
    runs with both engines. It could be promoted to conftest.py to apply to
    every test in a package.
    """
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
......
...@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI ...@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
MODEL_NAME = "openai/gpt-oss-20b" MODEL_NAME = "openai/gpt-oss-20b"
DTYPE = "bfloat16"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module")
def server(monkeypatch_module: pytest.MonkeyPatch):
args = ["--enforce-eager", "--tool-server", "demo"] args = ["--enforce-eager", "--tool-server", "demo"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with monkeypatch_module.context() as m:
yield remote_server m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture @pytest_asyncio.fixture
...@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): ...@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming(client: OpenAI, model_name: str): async def test_streaming(client: OpenAI, model_name: str):
# TODO: Add back when web search and code interpreter are available in CI
prompts = [ prompts = [
"tell me a story about a cat in 20 words", "tell me a story about a cat in 20 words",
"What is 13 * 24? Use python to calculate the result.", # "What is 13 * 24? Use python to calculate the result.",
"When did Jensen found NVIDIA? Search it and answer the year only.", # "When did Jensen found NVIDIA? Search it and answer the year only.",
] ]
for prompt in prompts: for prompt in prompts:
...@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str): ...@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
input=prompt, input=prompt,
reasoning={"effort": "low"}, reasoning={"effort": "low"},
tools=[ tools=[
{ # {
"type": "web_search_preview" # "type": "web_search_preview"
}, # },
{ # {
"type": "code_interpreter", # "type": "code_interpreter",
"container": { # "container": {
"type": "auto" # "type": "auto"
} # }
}, # },
], ],
stream=True, stream=True,
) )
...@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str): ...@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str): async def test_web_search(client: OpenAI, model_name: str):
response = await client.responses.create( response = await client.responses.create(
model=model_name, model=model_name,
...@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str): ...@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
async def test_code_interpreter(client: OpenAI, model_name: str): async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create( response = await client.responses.create(
model=model_name, model=model_name,
...@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str): ...@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str): async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
tools = [ tools = [
{ {
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, once per module.

    The server is started with automatic tool choice enabled and the
    ``hermes`` tool-call parser, and lives for as long as the
    ``RemoteOpenAIServer`` context manager stays open.
    """
    # Flags and their values are kept pairwise on one line for readability.
    launch_args = [
        "--max-model-len", "2048",
        "--max-num-seqs", "128",
        "--enable-auto-tool-choice",
        "--tool-call-parser", "hermes",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, launch_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_basic_completion_with_emoji(server):
    """Test basic completion with emoji to verify token_ids field.

    With ``return_token_ids`` enabled, the first choice must expose
    ``prompt_token_ids`` and ``token_ids`` that agree with the tokenizer.
    With it disabled, neither field may carry a value.
    """
    prompt = "Complete this sentence with emojis: I love coding 🚀"

    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": True},
        )

        # Check the raw response to see the structure
        completion_dict = completion.model_dump()
        choice = completion.choices[0]

        # Verify prompt_token_ids field is present in the completion choice
        assert "prompt_token_ids" in completion_dict["choices"][0]
        assert isinstance(choice.prompt_token_ids, list)

        # Check against the expected prompt token IDs
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(prompt)

        # The encoded prompt must appear as one contiguous run inside
        # prompt_token_ids (extra tokens before or after are tolerated).
        prompt_ids = choice.prompt_token_ids
        window = len(encoded_tokens)
        assert any(prompt_ids[start:start + window] == encoded_tokens
                   for start in range(len(prompt_ids) - window + 1))

        # Verify token_ids field is present in the choice
        assert choice.token_ids is not None
        assert isinstance(choice.token_ids, list)
        assert len(choice.token_ids) > 0

        # Verify decoding works correctly. Only a prefix match is required:
        # the decoded ids may carry trailing tokens (presumably an
        # end-of-turn marker such as <|im_end|>) that `text` omits.
        decoded_text = tokenizer.decode(choice.token_ids)
        assert decoded_text.startswith(choice.text)

        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )

        completion_without_dict = completion_without.model_dump()
        choice_without = completion_without_dict["choices"][0]
        assert choice_without.get("token_ids") is None
        # For completions, prompt_token_ids is read from the choice (see the
        # positive checks above), so the negative check must look there too.
        assert choice_without.get("prompt_token_ids") is None
        assert completion_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function).

    The same chat request is sent twice: once with ``return_token_ids``
    enabled, where the returned ids are decoded and compared against the
    expected prompt and tool-call text, and once with it disabled, where
    both id fields must be ``None``.
    """
    location_schema = {
        "type": "string",
        "description": "The city and state, e.g. San Francisco, CA",
    }
    unit_schema = {
        "type": "string",
        "enum": ["celsius", "fahrenheit"],
        "description": "The unit of temperature",
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": location_schema,
                    "unit": unit_schema,
                },
                "required": ["location"],
            },
        },
    }]

    # Everything except `extra_body` is identical for both requests.
    request_kwargs = dict(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What's the weather like in Paris?"
            },
        ],
        tools=tools,
        tool_choice="auto",
        max_tokens=100,
        temperature=0,
        logprobs=True,
    )

    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        response = await client.chat.completions.create(
            **request_kwargs,
            extra_body={"return_token_ids": True},
        )
        first_choice = response.choices[0]

        # Verify token_ids field is present in choices
        assert first_choice.token_ids is not None
        assert isinstance(first_choice.token_ids, list)

        # Verify prompt_token_ids field is present
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)

        # Verify the prompt texts and response texts
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        prompt_text = tokenizer.decode(response.prompt_token_ids)
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant.")
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n"
            "<|im_start|>assistant\n")

        response_text = tokenizer.decode(first_choice.token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")

        # If tool call was made, verify the response structure
        parsed_calls = first_choice.message.tool_calls
        if parsed_calls:
            assert len(parsed_calls) > 0
            assert parsed_calls[0].function.name == "get_weather"

        # Test without return_token_ids
        response_without = await client.chat.completions.create(
            **request_kwargs,
            extra_body={"return_token_ids": False},
        )

        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Test that token_ids align with prompt_logprobs and
    logprobs when return_tokens_as_token_ids is enabled.

    Covers a non-streaming request with ``echo=True`` and a streaming
    request with ``echo=False``.
    """

    def ids_from_logprob_tokens(tokens):
        # Token format is "token_id:12345" when return_tokens_as_token_ids
        # is True; entries without that prefix are ignored.
        return [
            int(tok.removeprefix("token_id:")) for tok in tokens
            if tok.startswith("token_id:")
        ]

    async with server.get_async_client() as client:
        # Test with both return_token_ids and return_tokens_as_token_ids
        # enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1
            },
        )
        choice = completion.choices[0]

        # Verify all fields are present
        assert choice.token_ids is not None
        assert choice.prompt_token_ids is not None
        assert choice.prompt_logprobs is not None
        assert choice.logprobs is not None

        # Extract token IDs from logprobs
        logprobs_token_ids = ids_from_logprob_tokens(choice.logprobs.tokens)

        # When echo=True, the logprobs include both prompt and response
        # tokens, so the generated token_ids must be a strict suffix of the
        # ids recovered from the logprobs.
        assert len(choice.token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(choice.token_ids)
        assert (logprobs_token_ids[-response_token_ids_length:] ==
                choice.token_ids)

        # Verify tokenizer consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        # Decode prompt tokens
        if choice.prompt_token_ids:
            prompt_text = tokenizer.decode(choice.prompt_token_ids)
            # The decoded prompt should match or be close to the original
            assert "Hello, world" in prompt_text

        # Decode response tokens
        if choice.token_ids:
            response_text = tokenizer.decode(choice.token_ids)
            assert choice.text.endswith(response_text)

        # Test streaming mode
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True
            },
        )

        # Collect streamed tokens
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        awaiting_first_chunk = True
        async for chunk in stream:
            chunk_choice = chunk.choices[0]
            streamed_logprob_token_ids.extend(
                ids_from_logprob_tokens(chunk_choice.logprobs.tokens))
            if awaiting_first_chunk:
                # The prompt ids are taken from the first chunk only.
                streamed_prompt_token_ids = chunk_choice.prompt_token_ids
                awaiting_first_chunk = False
            streamed_token_ids.extend(chunk_choice.token_ids)

        # Verify we collected some tokens and first chunk had
        # prompt_token_ids
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling.

    Runs the same conversation once without and once with streaming. In
    both cases the returned token ids are decoded and compared against the
    prompt and the generated content.
    """
    system_turn = {
        "role": "system",
        "content": "You like to use emojis in your responses."
    }
    user_turn = {
        "role": "user",
        "content": "Repeat after me: I love cats 🐱"
    }
    chat_messages = [system_turn, user_turn]

    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        first_choice = response.choices[0]

        # Verify token_ids are present
        response_dict = response.model_dump()
        assert first_choice.token_ids is not None
        assert "prompt_token_ids" in response_dict

        # Verify the response contains the expected fields
        assert first_choice.message.content is not None

        # Decode token_ids and verify consistency
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses.")
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")

        # The content should match the decoded response text
        # except the ending <|im_end|>
        decoded_response = tokenizer.decode(first_choice.token_ids)
        assert decoded_response == first_choice.message.content + "<|im_end|>"

        # Test with streaming
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )

        content_pieces = []
        collected_token_ids = []
        seen_first_chunk = False
        async for chunk in stream:
            if not seen_first_chunk:
                seen_first_chunk = True
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # Check the prompt_token_ids match the initial prompt
                decoded_prompt_stream = tokenizer.decode(
                    chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
            else:
                chunk_dump = chunk.model_dump()
                assert "prompt_token_ids" not in chunk_dump, \
                    "Subsequent chunks should not have prompt_token_ids"

            if not chunk.choices:
                continue

            streamed_choice = chunk.choices[0]
            if streamed_choice.delta.content:
                content_pieces.append(streamed_choice.delta.content)
            # token_ids may not be present in all chunks
            if "token_ids" in streamed_choice.model_dump():
                collected_token_ids.extend(streamed_choice.token_ids)

        collected_content = "".join(content_pieces)

        # Verify we got response and token_ids
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0

        # Verify token_ids decode properly
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + "<|im_end|>"
...@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse ...@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that requests ``run_with_both_engines`` for each test.

    It only exists to depend on that fixture so every test in this module
    runs with both engines. It could be promoted to conftest.py to apply to
    every test in a package.
    """
MODELS = [ MODELS = [
{ {
"name": "BAAI/bge-reranker-v2-m3", "name": "BAAI/bge-reranker-v2-m3",
......
...@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config(): ...@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config():
assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05 assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
@pytest.mark.parametrize("model_type", ["gpt_oss", "any"])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_serving_chat_did_set_correct_cache_salt(): async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_model_config = MockModelConfig() mock_model_config = MockModelConfig()
mock_model_config.hf_config.model_type = model_type
mock_engine = MagicMock(spec=MQLLMEngineClient) mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf)
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
    """Yield a tokenizer-less ``RemoteOpenAIServer``, once per module.

    The files of ``MODEL_NAME`` are fetched with
    ``download_weights_from_hf`` while skipping anything matching the
    tokenizer, vocab and safetensors patterns. The module-level
    ``MODEL_PATH`` is rebound to the value that call returns, and the
    server is then started from that path with ``--skip-tokenizer-init``
    and the ``dummy`` load format.
    """
    # Rebinding the module global lets the tests address the served model
    # by the same path the server was started with.
    global MODEL_PATH

    skipped_files = ["tokenizer*", "vocab*", "*.safetensors"]
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        ignore_patterns=skipped_files,
    )

    launch_args = [
        "--max-model-len", "2048",
        "--max-num-seqs", "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format", "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, launch_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Test token-in-token-out: a prompt given as token ids must come back
    unchanged in prompt_token_ids, and generated token_ids must be returned.

    The prompt is tokenized locally and sent as a list of ids with
    ``return_token_ids`` enabled. No logprobs are requested here; only the
    returned id fields are checked.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)

    async with server.get_async_client() as client:
        # Send the prompt as token ids and ask for token ids back
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )
        choice = completion.choices[0]

        # Generated ids must be present and respect max_tokens
        assert (choice.token_ids is not None
                and 0 < len(choice.token_ids) <= 20)
        assert choice.prompt_token_ids is not None

        # Decode the returned prompt ids and compare unconditionally: an
        # empty prompt_token_ids list must fail rather than skip the check,
        # since a non-empty prompt was sent.
        prompt_text = tokenizer.decode(choice.prompt_token_ids)
        assert prompt_text == text
...@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name): ...@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
language="en", language="en",
response_format="text", response_format="text",
temperature=0.0) temperature=0.0)
out = json.loads(transcription)['text'] out = json.loads(transcription)
assert "Mary had a little lamb," in out out_text = out['text']
out_usage = out['usage']
assert "Mary had a little lamb," in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client): ...@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
language="en", language="en",
response_format="text", response_format="text",
temperature=0.0) temperature=0.0)
out = json.loads(transcription)['text'] out = json.loads(transcription)
counts = out.count("Mary had a little lamb") out_text = out['text']
out_usage = out['usage']
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI): ...@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
assert response["usage"]["prompt_tokens"] == truncation_size assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
truncation_size = 0
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size
}
with pytest.raises(openai.BadRequestError) as err:
await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert err.value.status_code == 400
error_details = err.value.response.json()["error"]
assert error_details["type"] == "BadRequestError"
assert "This model's maximum context length is" in error_details["message"]
assert "tokens in the input for embedding generation" in error_details[
"message"]
assert "Please reduce the length of the input" in error_details["message"]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI): async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
truncation_size = max_model_len + 1 truncation_size = max_model_len + 1
...@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI): ...@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
} }
with pytest.raises(openai.BadRequestError) as err: with pytest.raises(openai.BadRequestError) as err:
err = await client.post(path="embeddings", await client.post(path="embeddings", cast_to=object, body={**kwargs})
cast_to=object,
body={**kwargs}) assert err.value.status_code == 400
error_details = err.value.response.json()["error"]
assert str(err) == f"""openai.BadRequestError: assert error_details["type"] == "BadRequestError"
Error code: 400 - {{'object': 'error', expected_message = ("truncate_prompt_tokens value is "
'message': 'truncate_prompt_tokens value "greater than max_model_len."
({truncation_size}) " Please, select a smaller truncation size.")
is greater than max_model_len ({max_model_len}). assert error_details["message"] == expected_message
Please, select a smaller truncation size.',
'type': 'BadRequestError',
'param': None, 'code': 400}}"""
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -6,8 +6,6 @@ import json ...@@ -6,8 +6,6 @@ import json
import openai import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import requests
from PIL import Image
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_base64, fetch_image
...@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): ...@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"role": "user", "role": "user",
"content": f"{placeholder}{content}", "content": f"{placeholder}{content}",
}] }]
images = [Image.open(requests.get(image_url, stream=True).raw)] images = [fetch_image(image_url)]
prompt = processor.tokenizer.apply_chat_template( prompt = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True) messages, tokenize=False, add_generation_prompt=True)
......
...@@ -5,7 +5,6 @@ import json ...@@ -5,7 +5,6 @@ import json
import pytest import pytest
import requests import requests
from PIL import Image
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.entrypoints.openai.protocol import EmbeddingResponse
...@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): ...@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> " placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}" prompt = f"{placeholder}{content}"
images = [Image.open(requests.get(image_url, stream=True).raw)] images = [fetch_image(image_url)]
inputs = processor(prompt, images, return_tensors="pt") inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1] return inputs.input_ids.shape[1]
......
# GSM8K Accuracy Evaluation
This directory contains a replacement for the lm-eval-harness GSM8K evaluation. It uses an isolated GSM8K script and a vLLM server for better performance and control.
## Usage
### Run tests with pytest (like buildkite)
```bash
pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
```
### Run standalone evaluation script
```bash
# Start vLLM server first
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
# Run evaluation
python tests/gsm8k/gsm8k_eval.py --port 8000
```
## Configuration Format
Model configs in `configs/` directory use this YAML format:
```yaml
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold: 0.54 # Minimum expected accuracy
num_questions: 1319 # Number of questions (default: full test set)
num_fewshot: 5 # Few-shot examples from train set
max_model_len: 4096 # Model context length
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold: 0.74
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold: 0.31
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold: 0.45
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold: 0.60
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold: 0.375
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
Qwen3-0.6B-FP8.yaml
Llama-3.2-1B-Instruct-INT8-CT.yaml
Llama-3-8B-Instruct-nonuniform-CT.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment