Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import subprocess
import sys
import tempfile
......@@ -295,6 +295,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert metric in response.text
@pytest.mark.asyncio
async def test_abort_metrics_reset(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool):
running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server))
# Expect no running requests or kvcache usage
assert running_requests == 0
assert waiting_requests == 0
assert kv_cache_usage == 0.0
# Start some long-running requests that we can abort
tasks = []
for _ in range(3):
task = asyncio.create_task(
client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
max_tokens=100, # Long generation to give time to abort
temperature=0.0))
tasks.append(task)
# Wait a bit for requests to start processing
await asyncio.sleep(0.5)
# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server))
# Expect running requests and kvcache usage
assert running_requests > 0
assert kv_cache_usage > 0
# Cancel all tasks to abort the requests
for task in tasks:
task.cancel()
# Wait for cancellations to be processed
await asyncio.sleep(1.0)
# Check that metrics have reset to zero
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server))
assert running_requests_after == 0,\
(f"Expected 0 running requests after abort, got "
f"{running_requests_after}")
assert waiting_requests_after == 0,\
(f"Expected 0 waiting requests after abort, got "
f"{waiting_requests_after}")
assert kv_cache_usage_after == 0,\
(f"Expected 0% KV cache usage after abort, got "
f"{kv_cache_usage_after}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
"""Return (running_count, waiting_count, kv_cache_usage)"""
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
# Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None
for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running":
for sample in family.samples:
if sample.name == "vllm:num_requests_running":
running_requests = sample.value
break
elif family.name == "vllm:num_requests_waiting":
for sample in family.samples:
if sample.name == "vllm:num_requests_waiting":
waiting_requests = sample.value
break
elif family.name == "vllm:gpu_cache_usage_perc":
for sample in family.samples:
if sample.name == "vllm:gpu_cache_usage_perc":
kv_cache_usage = sample.value
break
assert running_requests is not None
assert waiting_requests is not None
assert kv_cache_usage is not None
return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch(use_v1: bool):
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
......
......@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
http://localhost:8000/v1/chat/completions
""" # noqa: E501
if (hasattr(case, "body") and isinstance(case.body, dict)
and "messages" in case.body
and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0):
for message in case.body["messages"]:
if not isinstance(message, dict):
continue
# Check for invalid file type in tokenize endpoint
if op.method.lower() == "post" and op.path == "/tokenize":
content = message.get("content", [])
if (isinstance(content, list) and len(content) > 0 and any(
item.get("type") == "file" for item in content)):
return False
# Check for invalid tool_calls with non-function types
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if isinstance(tool_call, dict):
if tool_call.get("type") != "function":
return False
if "custom" in tool_call:
return False
if hasattr(case, "body") and isinstance(case.body, dict):
if ("messages" in case.body
and isinstance(case.body["messages"], list)
and len(case.body["messages"]) > 0):
for message in case.body["messages"]:
if not isinstance(message, dict):
continue
# Check for invalid file type in tokenize endpoint
if op.method.lower() == "post" and op.path == "/tokenize":
content = message.get("content", [])
if (isinstance(content, list) and len(content) > 0
and any(
item.get("type") == "file"
for item in content)):
return False
# Check for invalid tool_calls with non-function types
tool_calls = message.get("tool_calls", [])
if isinstance(tool_calls, list):
for tool_call in tool_calls:
if isinstance(tool_call, dict):
if tool_call.get("type") != "function":
return False
if "custom" in tool_call:
return False
# Sometimes guided_grammar is generated to be empty
# Causing a server error in EBNF grammar parsing
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
guided_grammar = case.body.get("guided_grammar")
if guided_grammar == '':
# Allow None (will be handled as no grammar)
# But skip empty strings
return False
return True
return strategy.filter(no_invalid_types)
......
......@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
......
......@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
from ...utils import RemoteOpenAIServer
pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
MODEL_NAME = "openai/gpt-oss-20b"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module")
def server(monkeypatch_module: pytest.MonkeyPatch):
args = ["--enforce-eager", "--tool-server", "demo"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
with monkeypatch_module.context() as m:
m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
......@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming(client: OpenAI, model_name: str):
# TODO: Add back when web search and code interpreter are available in CI
prompts = [
"tell me a story about a cat in 20 words",
"What is 13 * 24? Use python to calculate the result.",
"When did Jensen found NVIDIA? Search it and answer the year only.",
# "What is 13 * 24? Use python to calculate the result.",
# "When did Jensen found NVIDIA? Search it and answer the year only.",
]
for prompt in prompts:
......@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
input=prompt,
reasoning={"effort": "low"},
tools=[
{
"type": "web_search_preview"
},
{
"type": "code_interpreter",
"container": {
"type": "auto"
}
},
# {
# "type": "web_search_preview"
# },
# {
# "type": "code_interpreter",
# "container": {
# "type": "auto"
# }
# },
],
stream=True,
)
......@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Web search tool is not available in CI yet.")
async def test_web_search(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
......@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
model=model_name,
......@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=5)
async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
tools = [
{
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enable-auto-tool-choice",
"--tool-call-parser",
"hermes",
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
async def test_basic_completion_with_emoji(server):
"""Test basic completion with emoji to verify token_ids field."""
async with server.get_async_client() as client:
# Test with return_token_ids enabled
completion = await client.completions.create(
model=MODEL_NAME,
prompt="Complete this sentence with emojis: I love coding 🚀",
max_tokens=10,
temperature=0,
logprobs=1,
extra_body={"return_token_ids": True},
)
# Check the raw response to see the structure
completion_dict = completion.model_dump()
# Verify prompt_token_ids field is present in the completion response
assert "prompt_token_ids" in completion_dict["choices"][0]
assert isinstance(completion.choices[0].prompt_token_ids, list)
# Check against the expected prompt token IDs
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
encoded_tokens = tokenizer.encode(
"Complete this sentence with emojis: I love coding 🚀")
# Check that encoded_tokens is a subsequence of prompt_token_ids
assert any(completion.choices[0].prompt_token_ids[i:i +
len(encoded_tokens)]
== encoded_tokens for i in range(
len(completion.choices[0].prompt_token_ids) -
len(encoded_tokens) + 1))
# Verify token_ids field is present in the choice
assert completion.choices[0].token_ids is not None
assert isinstance(completion.choices[0].token_ids, list)
assert len(completion.choices[0].token_ids) > 0
# Verify decoding works correctly
decoded_text = tokenizer.decode(completion.choices[0].token_ids)
# The decoded text should contain a <|im_end|> at the end
assert decoded_text.startswith(completion.choices[0].text)
# Test without return_token_ids (should be None)
completion_without = await client.completions.create(
model=MODEL_NAME,
prompt="Complete this sentence with emojis: I love coding 🚀",
max_tokens=10,
temperature=0,
logprobs=1,
extra_body={"return_token_ids": False},
)
completion_without_dict = completion_without.model_dump()
assert completion_without_dict["choices"][0].get("token_ids") is None
assert completion_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
"""Test chat completion with tool use (get_weather function)."""
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type":
"string",
"description":
"The city and state, e.g. San Francisco, CA",
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The unit of temperature",
},
},
"required": ["location"],
},
},
}]
async with server.get_async_client() as client:
# Test with return_token_ids enabled
response = await client.chat.completions.create(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What's the weather like in Paris?"
},
],
tools=tools,
tool_choice="auto",
max_tokens=100,
temperature=0,
logprobs=True,
extra_body={"return_token_ids": True},
)
# Verify token_ids field is present in choices
assert response.choices[0].token_ids is not None
assert isinstance(response.choices[0].token_ids, list)
# Verify prompt_token_ids field is present
assert response.prompt_token_ids is not None
assert isinstance(response.prompt_token_ids, list)
# Verify the prompt texts and response texts
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
prompt_text = tokenizer.decode(response.prompt_token_ids)
assert prompt_text.startswith(
"<|im_start|>system\nYou are a helpful assistant.")
assert prompt_text.endswith(
"What's the weather like in Paris?<|im_end|>\n"
"<|im_start|>assistant\n")
response_text = tokenizer.decode(response.choices[0].token_ids)
assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
assert response_text.endswith("</tool_call><|im_end|>")
# If tool call was made, verify the response structure
if response.choices[0].message.tool_calls:
assert len(response.choices[0].message.tool_calls) > 0
tool_call = response.choices[0].message.tool_calls[0]
assert tool_call.function.name == "get_weather"
# Test without return_token_ids
response_without = await client.chat.completions.create(
model=MODEL_NAME,
messages=[
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What's the weather like in Paris?"
},
],
tools=tools,
tool_choice="auto",
max_tokens=100,
temperature=0,
logprobs=True,
extra_body={"return_token_ids": False},
)
assert response_without.choices[0].token_ids is None
assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
"""
Test that token_ids align with prompt_logprobs and
logprobs when return_tokens_as_token_ids is enabled.
"""
async with server.get_async_client() as client:
# Test with both return_token_ids and return_tokens_as_token_ids enabled
completion = await client.completions.create(
model=MODEL_NAME,
prompt="Hello, world! How are you today?",
max_tokens=20,
temperature=0,
echo=True,
logprobs=1,
extra_body={
"return_token_ids": True,
"return_tokens_as_token_ids": True,
"prompt_logprobs": 1
},
)
# Verify all fields are present
assert completion.choices[0].token_ids is not None
assert completion.choices[0].prompt_token_ids is not None
assert completion.choices[0].prompt_logprobs is not None
assert completion.choices[0].logprobs is not None
# Extract token IDs from logprobs
# (when return_tokens_as_token_ids is True)
logprobs_token_ids = []
for token_str in completion.choices[0].logprobs.tokens:
# Token format is "token_id:12345" when
# return_tokens_as_token_ids is True
if token_str.startswith("token_id:"):
token_id = int(token_str.removeprefix("token_id:"))
logprobs_token_ids.append(token_id)
# When echo=True, the logprobs include both prompt and response tokens
# The token_ids field should match the the suffix of response portion
# The prompt_token_ids should match the prompt portion
assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
response_token_ids_length = len(completion.choices[0].token_ids)
assert logprobs_token_ids[-response_token_ids_length:] == \
completion.choices[0].token_ids
# Verify tokenizer consistency
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Decode prompt tokens
if completion.choices[0].prompt_token_ids:
prompt_text = tokenizer.decode(
completion.choices[0].prompt_token_ids)
# The decoded prompt should match or close to original prompt
assert "Hello, world" in prompt_text
# Decode response tokens
if completion.choices[0].token_ids:
response_text = tokenizer.decode(completion.choices[0].token_ids)
assert completion.choices[0].text.endswith(response_text)
# Test streaming mode
stream = await client.completions.create(
model=MODEL_NAME,
prompt="Tell me a short fact about Python:",
max_tokens=30,
temperature=0,
stream=True,
echo=False,
logprobs=1,
extra_body={
"return_token_ids": True,
"return_tokens_as_token_ids": True
},
)
# Collect streamed tokens
streamed_prompt_token_ids = []
streamed_token_ids = []
streamed_logprob_token_ids = []
first_chunk = True
async for chunk in stream:
for token_str in chunk.choices[0].logprobs.tokens:
# Token format is "token_id:12345" when
# return_tokens_as_token_ids is True
if token_str.startswith("token_id:"):
token_id = int(token_str.removeprefix("token_id:"))
streamed_logprob_token_ids.append(token_id)
if first_chunk:
streamed_prompt_token_ids = chunk.choices[0].prompt_token_ids
first_chunk = False
streamed_token_ids += chunk.choices[0].token_ids
# Verify we collected some tokens and first chunk had prompt_token_ids
assert len(streamed_prompt_token_ids) > 0
assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
"""Test chat completion with emojis to verify token_ids handling."""
chat_messages = [
{
"role": "system",
"content": "You like to use emojis in your responses."
},
{
"role": "user",
"content": "Repeat after me: I love cats 🐱"
},
]
async with server.get_async_client() as client:
response = await client.chat.completions.create(
model=MODEL_NAME,
messages=chat_messages,
max_tokens=50,
temperature=0,
logprobs=True,
extra_body={"return_token_ids": True},
)
# Verify token_ids are present
response_dict = response.model_dump()
assert response.choices[0].token_ids is not None
assert "prompt_token_ids" in response_dict
# Verify the response contains the expected fields
assert response.choices[0].message.content is not None
# Decode token_ids and verify consistency
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
decoded_prompt = tokenizer.decode(response.prompt_token_ids)
assert decoded_prompt.startswith(
"<|im_start|>system\nYou like to use emojis in your responses.")
assert decoded_prompt.endswith(
"I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")
decoded_response = tokenizer.decode(response.choices[0].token_ids)
# The content should match the response text
# except the ending <|im_end|>
assert decoded_response == response.choices[
0].message.content + "<|im_end|>"
# Test with streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=chat_messages,
max_tokens=50,
temperature=0,
stream=True,
extra_body={"return_token_ids": True},
)
collected_content = ""
collected_token_ids = []
first_chunk = True
async for chunk in stream:
if first_chunk:
assert chunk.prompt_token_ids is not None
assert isinstance(chunk.prompt_token_ids, list)
# Check the prompt_token_ids match the initial prompt
decoded_prompt_stream = tokenizer.decode(
chunk.prompt_token_ids)
assert decoded_prompt_stream == decoded_prompt
first_chunk = False
else:
chunk_dump = chunk.model_dump()
assert "prompt_token_ids" not in chunk_dump, \
"Subsequent chunks should not have prompt_token_ids"
if chunk.choices:
if chunk.choices[0].delta.content:
collected_content += chunk.choices[0].delta.content
# token_ids may not present in all chunks
choice_dump = chunk.choices[0].model_dump()
if "token_ids" in choice_dump:
collected_token_ids.extend(chunk.choices[0].token_ids)
# Verify we got response and token_ids
assert len(collected_content) > 0
assert len(collected_token_ids) > 0
# Verify token_ids decode properly
decoded_response = tokenizer.decode(collected_token_ids)
assert decoded_response == collected_content + "<|im_end|>"
......@@ -14,15 +14,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
MODELS = [
{
"name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
......
......@@ -284,9 +284,11 @@ async def test_serving_chat_could_load_correct_generation_config():
assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
@pytest.mark.parametrize("model_type", ["gpt_oss", "any"])
@pytest.mark.asyncio
async def test_serving_chat_did_set_correct_cache_salt():
async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_model_config = MockModelConfig()
mock_model_config.hf_config.model_type = model_type
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import tempfile
import pytest
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf)
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
global MODEL_PATH
MODEL_PATH = download_weights_from_hf(
MODEL_NAME,
allow_patterns=["*"],
cache_dir=MODEL_PATH,
ignore_patterns=["tokenizer*", "vocab*", "*.safetensors"])
args = [
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
"--skip-tokenizer-init",
"--load-format",
"dummy",
]
with RemoteOpenAIServer(MODEL_PATH, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
"""
Test token-in-token-out and token_ids align with prompt_logprobs
& logprobs when return_tokens_as_token_ids is enabled.
"""
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
text = "Hello, world! How are you today?"
token_ids = tokenizer.encode(text)
async with server.get_async_client() as client:
# Test with both return_token_ids and return_tokens_as_token_ids enabled
completion = await client.completions.create(
model=MODEL_PATH,
prompt=token_ids,
max_tokens=20,
temperature=0,
echo=True,
extra_body={
"return_token_ids": True,
},
)
# Verify all fields are present
assert (completion.choices[0].token_ids is not None
and 0 < len(completion.choices[0].token_ids) <= 20)
assert completion.choices[0].prompt_token_ids is not None
# Decode prompt tokens
if completion.choices[0].prompt_token_ids:
prompt_text = tokenizer.decode(
completion.choices[0].prompt_token_ids)
# The decoded prompt should match or close to original prompt
assert prompt_text == text
......@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
language="en",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert "Mary had a little lamb," in out
out = json.loads(transcription)
out_text = out['text']
out_usage = out['usage']
assert "Mary had a little lamb," in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
......@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
language="en",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
counts = out.count("Mary had a little lamb")
out = json.loads(transcription)
out_text = out['text']
out_usage = out['usage']
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
......
......@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
assert response["usage"]["prompt_tokens"] == truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
truncation_size = 0
kwargs: dict[str, Any] = {
"model": MODEL_NAME,
"input": input,
"truncate_prompt_tokens": truncation_size
}
with pytest.raises(openai.BadRequestError) as err:
await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert err.value.status_code == 400
error_details = err.value.response.json()["error"]
assert error_details["type"] == "BadRequestError"
assert "This model's maximum context length is" in error_details["message"]
assert "tokens in the input for embedding generation" in error_details[
"message"]
assert "Please reduce the length of the input" in error_details["message"]
@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
truncation_size = max_model_len + 1
......@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
}
with pytest.raises(openai.BadRequestError) as err:
err = await client.post(path="embeddings",
cast_to=object,
body={**kwargs})
assert str(err) == f"""openai.BadRequestError:
Error code: 400 - {{'object': 'error',
'message': 'truncate_prompt_tokens value
({truncation_size})
is greater than max_model_len ({max_model_len}).
Please, select a smaller truncation size.',
'type': 'BadRequestError',
'param': None, 'code': 400}}"""
await client.post(path="embeddings", cast_to=object, body={**kwargs})
assert err.value.status_code == 400
error_details = err.value.response.json()["error"]
assert error_details["type"] == "BadRequestError"
expected_message = ("truncate_prompt_tokens value is "
"greater than max_model_len."
" Please, select a smaller truncation size.")
assert error_details["message"] == expected_message
@pytest.mark.asyncio
......
......@@ -7,8 +7,6 @@ import openai
import pytest
import os
import pytest_asyncio
import requests
from PIL import Image
from transformers import AutoProcessor
from vllm.multimodal.utils import encode_image_base64, fetch_image
......@@ -97,7 +95,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"role": "user",
"content": f"{placeholder}{content}",
}]
images = [Image.open(requests.get(image_url, stream=True).raw)]
images = [fetch_image(image_url)]
prompt = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True)
......
......@@ -6,7 +6,6 @@ import json
import pytest
import requests
from PIL import Image
from transformers import AutoProcessor
from vllm.entrypoints.openai.protocol import EmbeddingResponse
......@@ -72,7 +71,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder = "<|image_1|> "
prompt = f"{placeholder}{content}"
images = [Image.open(requests.get(image_url, stream=True).raw)]
images = [fetch_image(image_url)]
inputs = processor(prompt, images, return_tensors="pt")
return inputs.input_ids.shape[1]
......
# GSM8K Accuracy Evaluation
This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control.
## Usage
### Run tests with pytest (like buildkite)
```bash
pytest -s -v tests/gsm8k/test_gsm8k_correctness.py \
--config-list-file=configs/models-small.txt \
--tp-size=1
```
### Run standalone evaluation script
```bash
# Start vLLM server first
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000
# Run evaluation
python tests/gsm8k/gsm8k_eval.py --port 8000
```
## Configuration Format
Model configs in `configs/` directory use this YAML format:
```yaml
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold: 0.54 # Minimum expected accuracy
num_questions: 1319 # Number of questions (default: full test set)
num_fewshot: 5 # Few-shot examples from train set
max_model_len: 4096 # Model context length
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold: 0.74
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold: 0.31
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold: 0.45
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold: 0.60
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold: 0.375
num_questions: 1319
num_fewshot: 5
max_model_len: 4096
\ No newline at end of file
Qwen3-0.6B-FP8.yaml
Llama-3.2-1B-Instruct-INT8-CT.yaml
Llama-3-8B-Instruct-nonuniform-CT.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment