Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
501
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
720 additions
and
82 deletions
+720
-82
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+94
-1
tests/entrypoints/openai/test_openai_schema.py
tests/entrypoints/openai/test_openai_schema.py
+38
-25
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+0
-8
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+28
-17
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_token_ids.py
+374
-0
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+0
-9
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+3
-1
tests/entrypoints/openai/test_token_in_token_out.py
tests/entrypoints/openai/test_token_in_token_out.py
+73
-0
tests/entrypoints/openai/test_transcription_validation.py
tests/entrypoints/openai/test_transcription_validation.py
+10
-4
tests/entrypoints/openai/test_truncation.py
tests/entrypoints/openai/test_truncation.py
+31
-12
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+1
-3
tests/entrypoints/openai/test_vision_embedding.py
tests/entrypoints/openai/test_vision_embedding.py
+1
-2
tests/evals/gsm8k/README.md
tests/evals/gsm8k/README.md
+35
-0
tests/evals/gsm8k/__init__.py
tests/evals/gsm8k/__init__.py
+2
-0
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
...vals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...als/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+5
-0
tests/evals/gsm8k/configs/models-small.txt
tests/evals/gsm8k/configs/models-small.txt
+5
-0
No files found.
Too many changes to show.
To preserve performance only
501 of 501+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/test_metrics.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
subprocess
import
sys
import
tempfile
...
...
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
metric
in
response
.
text
@pytest.mark.asyncio
async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                   client: openai.AsyncClient, use_v1: bool):
    """Check that aborting in-flight requests resets the load metrics.

    The test reads the running/waiting request gauges and the KV cache usage
    gauge before any load, while three completion requests are in flight,
    and again after those requests have been cancelled.

    Args:
        server: Server whose ``metrics`` endpoint is polled.
        client: Async OpenAI client used to issue the completion requests.
        use_v1: Not read by the test body; presumably requested only for the
            side effects of the fixture -- confirm against conftest.
    """
    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))

    # Expect no running requests or kvcache usage
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0

    # Start some long-running requests that we can abort
    tasks = [
        asyncio.create_task(
            client.completions.create(
                model=MODEL_NAME,
                prompt=_TOKENIZED_PROMPT,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0)) for _ in range(3)
    ]

    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)

    # Check that we have running requests
    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))

    # Expect running requests and kvcache usage
    assert running_requests > 0
    assert kv_cache_usage > 0

    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()

    # Await the cancelled tasks so the cancellation is actually delivered to
    # each coroutine before we look at the metrics. return_exceptions=True
    # keeps a CancelledError (or an already-finished result) from propagating.
    await asyncio.gather(*tasks, return_exceptions=True)

    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)

    # Verify running and waiting requests counts and KV cache usage are zero.
    # The helper fetches the metrics endpoint and asserts an OK status itself,
    # so no separate request is needed here.
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server))

    assert running_requests_after == 0,\
        (f"Expected 0 running requests after abort, got "
         f"{running_requests_after}")
    assert waiting_requests_after == 0,\
        (f"Expected 0 waiting requests after abort, got "
         f"{waiting_requests_after}")
    assert kv_cache_usage_after == 0,\
        (f"Expected 0% KV cache usage after abort, got "
         f"{kv_cache_usage_after}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage)

    Fetches the server's ``metrics`` endpoint, parses the response text with
    ``text_string_to_metric_families`` and extracts the value of the first
    sample whose name equals its family name for each of the three gauges.

    Args:
        server: Server whose ``metrics`` URL is queried.

    Returns:
        A 3-tuple of sample values in the order
        ``vllm:num_requests_running``, ``vllm:num_requests_waiting``,
        ``vllm:gpu_cache_usage_perc``.

    Raises:
        AssertionError: If the endpoint does not answer with ``HTTPStatus.OK``
            or any of the three metrics is missing from the response.
    """
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    # Metric names to extract, in the order they are returned.
    wanted = (
        "vllm:num_requests_running",
        "vllm:num_requests_waiting",
        "vllm:gpu_cache_usage_perc",
    )
    found = dict.fromkeys(wanted)

    for family in text_string_to_metric_families(response.text):
        if family.name not in found:
            continue
        # Take the first sample carrying the family's own name.
        for sample in family.samples:
            if sample.name == family.name:
                found[family.name] = sample.value
                break

    for name in wanted:
        assert found[name] is not None, (
            f"Metric {name!r} not found in the metrics response")

    return tuple(found[name] for name in wanted)
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""
# noqa: E501
...
...
tests/entrypoints/openai/test_openai_schema.py
View file @
d2b52805
...
...
@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}'
\
http://localhost:8000/v1/chat/completions
"""
# noqa: E501
if
(
hasattr
(
case
,
"body"
)
and
isinstance
(
case
.
body
,
dict
)
and
"messages"
in
case
.
body
and
isinstance
(
case
.
body
[
"messages"
],
list
)
and
len
(
case
.
body
[
"messages"
])
>
0
):
for
message
in
case
.
body
[
"messages"
]:
if
not
isinstance
(
message
,
dict
):
continue
# Check for invalid file type in tokenize endpoint
if
op
.
method
.
lower
()
==
"post"
and
op
.
path
==
"/tokenize"
:
content
=
message
.
get
(
"content"
,
[])
if
(
isinstance
(
content
,
list
)
and
len
(
content
)
>
0
and
any
(
item
.
get
(
"type"
)
==
"file"
for
item
in
content
)):
return
False
# Check for invalid tool_calls with non-function types
tool_calls
=
message
.
get
(
"tool_calls"
,
[])
if
isinstance
(
tool_calls
,
list
):
for
tool_call
in
tool_calls
:
if
isinstance
(
tool_call
,
dict
):
if
tool_call
.
get
(
"type"
)
!=
"function"
:
return
False
if
"custom"
in
tool_call
:
return
False
if
hasattr
(
case
,
"body"
)
and
isinstance
(
case
.
body
,
dict
):
if
(
"messages"
in
case
.
body
and
isinstance
(
case
.
body
[
"messages"
],
list
)
and
len
(
case
.
body
[
"messages"
])
>
0
):
for
message
in
case
.
body
[
"messages"
]:
if
not
isinstance
(
message
,
dict
):
continue
# Check for invalid file type in tokenize endpoint
if
op
.
method
.
lower
()
==
"post"
and
op
.
path
==
"/tokenize"
:
content
=
message
.
get
(
"content"
,
[])
if
(
isinstance
(
content
,
list
)
and
len
(
content
)
>
0
and
any
(
item
.
get
(
"type"
)
==
"file"
for
item
in
content
)):
return
False
# Check for invalid tool_calls with non-function types
tool_calls
=
message
.
get
(
"tool_calls"
,
[])
if
isinstance
(
tool_calls
,
list
):
for
tool_call
in
tool_calls
:
if
isinstance
(
tool_call
,
dict
):
if
tool_call
.
get
(
"type"
)
!=
"function"
:
return
False
if
"custom"
in
tool_call
:
return
False
# Sometimes guided_grammar is generated to be empty
# Causing a server error in EBNF grammar parsing
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
guided_grammar
=
case
.
body
.
get
(
"guided_grammar"
)
if
guided_grammar
==
''
:
# Allow None (will be handled as no grammar)
# But skip empty strings
return
False
return
True
return
strategy
.
filter
(
no_invalid_types
)
...
...
tests/entrypoints/openai/test_rerank.py
View file @
d2b52805
...
...
@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
...
...
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
d2b52805
...
...
@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
from
...utils
import
RemoteOpenAIServer
pytest
.
skip
(
allow_module_level
=
True
,
reason
=
"gpt-oss can't run on CI yet."
)
MODEL_NAME
=
"openai/gpt-oss-20b"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
monkeypatch_module
:
pytest
.
MonkeyPatch
):
args
=
[
"--enforce-eager"
,
"--tool-server"
,
"demo"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
with
monkeypatch_module
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ENABLE_RESPONSES_API_STORE"
,
"1"
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
...
...
@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
):
# TODO: Add back when web search and code interpreter are available in CI
prompts
=
[
"tell me a story about a cat in 20 words"
,
"What is 13 * 24? Use python to calculate the result."
,
"When did Jensen found NVIDIA? Search it and answer the year only."
,
#
"What is 13 * 24? Use python to calculate the result.",
#
"When did Jensen found NVIDIA? Search it and answer the year only.",
]
for
prompt
in
prompts
:
...
...
@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
input
=
prompt
,
reasoning
=
{
"effort"
:
"low"
},
tools
=
[
{
"type"
:
"web_search_preview"
},
{
"type"
:
"code_interpreter"
,
"container"
:
{
"type"
:
"auto"
}
},
#
{
#
"type": "web_search_preview"
#
},
#
{
#
"type": "code_interpreter",
#
"container": {
#
"type": "auto"
#
}
#
},
],
stream
=
True
,
)
...
...
@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"Web search tool is not available in CI yet."
)
async
def
test_web_search
(
client
:
OpenAI
,
model_name
:
str
):
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
...
...
@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"Code interpreter tool is not available in CI yet."
)
async
def
test_code_interpreter
(
client
:
OpenAI
,
model_name
:
str
):
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
...
...
@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
flaky
(
reruns
=
5
)
async
def
test_function_calling_multi_turn
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
{
...
...
tests/entrypoints/openai/test_return_token_ids.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for MODEL_NAME, shared by the module.

    The server is launched with a 2048-token model length, at most 128
    sequences, automatic tool choice with the ``hermes`` tool-call parser,
    and eager execution enforced.
    """
    cli_args = ["--max-model-len", "2048"]
    cli_args += ["--max-num-seqs", "128"]
    cli_args += ["--enable-auto-tool-choice"]
    cli_args += ["--tool-call-parser", "hermes"]
    cli_args += ["--enforce-eager"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_basic_completion_with_emoji(server):
    """Test basic completion with emoji to verify token_ids field.

    With ``return_token_ids`` enabled the choice must carry both
    ``prompt_token_ids`` and ``token_ids``; with it disabled both must be
    absent or ``None``.
    """
    prompt = "Complete this sentence with emojis: I love coding 🚀"

    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": True},
        )

        # Check the raw response to see the structure
        completion_dict = completion.model_dump()
        choice = completion.choices[0]

        # Verify prompt_token_ids field is present in the completion response
        assert "prompt_token_ids" in completion_dict["choices"][0]
        assert isinstance(choice.prompt_token_ids, list)

        # Check against the expected prompt token IDs
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(prompt)
        # Check that encoded_tokens occurs as a contiguous run inside
        # prompt_token_ids
        prompt_token_ids = choice.prompt_token_ids
        window = len(encoded_tokens)
        assert any(prompt_token_ids[i:i + window] == encoded_tokens
                   for i in range(len(prompt_token_ids) - window + 1))

        # Verify token_ids field is present in the choice
        assert choice.token_ids is not None
        assert isinstance(choice.token_ids, list)
        assert len(choice.token_ids) > 0

        # Verify decoding works correctly
        decoded_text = tokenizer.decode(choice.token_ids)
        # The decoded text may carry a trailing <|im_end|>, so only the
        # prefix is compared against the returned text
        assert decoded_text.startswith(choice.text)

        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )

        completion_without_dict = completion_without.model_dump()
        choice_without = completion_without_dict["choices"][0]
        assert choice_without.get("token_ids") is None
        # For completions, prompt_token_ids lives on the choice (see the
        # enabled case above), so that is where its absence must be checked.
        assert choice_without.get("prompt_token_ids") is None
        # Kept from the original test: nothing at the top level either.
        assert completion_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function).

    The same chat request is sent twice, once with ``return_token_ids``
    enabled and once disabled, and the token ID fields are checked in
    both responses.
    """
    location_schema = {
        "type": "string",
        "description": "The city and state, e.g. San Francisco, CA",
    }
    unit_schema = {
        "type": "string",
        "enum": ["celsius", "fahrenheit"],
        "description": "The unit of temperature",
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": location_schema,
                    "unit": unit_schema,
                },
                "required": ["location"],
            },
        },
    }]

    async with server.get_async_client() as client:

        async def ask_weather(return_token_ids):
            # Build a fresh message list for every request.
            return await client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant."
                    },
                    {
                        "role": "user",
                        "content": "What's the weather like in Paris?"
                    },
                ],
                tools=tools,
                tool_choice="auto",
                max_tokens=100,
                temperature=0,
                logprobs=True,
                extra_body={"return_token_ids": return_token_ids},
            )

        # Request with return_token_ids enabled
        response = await ask_weather(True)
        first_choice = response.choices[0]

        # token_ids must be present on the choice
        assert first_choice.token_ids is not None
        assert isinstance(first_choice.token_ids, list)

        # prompt_token_ids must be present on the response
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)

        # Decode both ID lists and compare against the expected text
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        prompt_text = tokenizer.decode(response.prompt_token_ids)
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant.")
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n"
            "<|im_start|>assistant\n")

        response_text = tokenizer.decode(first_choice.token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")

        # When the parsed message carries tool calls, check the first one
        parsed_tool_calls = first_choice.message.tool_calls
        if parsed_tool_calls:
            assert len(parsed_tool_calls) > 0
            tool_call = parsed_tool_calls[0]
            assert tool_call.function.name == "get_weather"

        # Same request with return_token_ids disabled
        response_without = await ask_weather(False)

        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Check that the returned token_ids agree with the token IDs embedded in
    the logprobs when return_tokens_as_token_ids is enabled, for both a
    regular and a streaming completion.
    """
    id_prefix = "token_id:"

    def ids_from_logprob_tokens(token_strs):
        # Tokens look like "token_id:12345" when return_tokens_as_token_ids
        # is True; entries without that prefix are ignored.
        return [
            int(token_str.removeprefix(id_prefix)) for token_str in token_strs
            if token_str.startswith(id_prefix)
        ]

    async with server.get_async_client() as client:
        # Non-streaming request with return_token_ids and
        # return_tokens_as_token_ids both enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1
            },
        )
        choice = completion.choices[0]

        # Every requested field has to be populated
        assert choice.token_ids is not None
        assert choice.prompt_token_ids is not None
        assert choice.prompt_logprobs is not None
        assert choice.logprobs is not None

        # Token IDs recovered from the logprobs token strings
        logprobs_token_ids = ids_from_logprob_tokens(choice.logprobs.tokens)

        # With echo=True the logprobs cover prompt and response tokens, so
        # token_ids must be strictly shorter and equal to the tail of the
        # logprobs IDs.
        assert len(choice.token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(choice.token_ids)
        tail = logprobs_token_ids[-response_token_ids_length:]
        assert tail == choice.token_ids

        # Tokenizer round trip
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        # Prompt IDs should decode to text containing the original prompt
        if choice.prompt_token_ids:
            prompt_text = tokenizer.decode(choice.prompt_token_ids)
            assert "Hello, world" in prompt_text

        # Response IDs should decode to the end of the echoed text
        if choice.token_ids:
            response_text = tokenizer.decode(choice.token_ids)
            assert choice.text.endswith(response_text)

        # Streaming request
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True
            },
        )

        # Accumulate what each chunk reports
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        awaiting_first_chunk = True
        async for chunk in stream:
            chunk_choice = chunk.choices[0]
            streamed_logprob_token_ids.extend(
                ids_from_logprob_tokens(chunk_choice.logprobs.tokens))
            if awaiting_first_chunk:
                # Only the first chunk's prompt_token_ids are recorded
                streamed_prompt_token_ids = chunk_choice.prompt_token_ids
                awaiting_first_chunk = False
            streamed_token_ids += chunk_choice.token_ids

        # The first chunk must have carried prompt_token_ids, and the
        # streamed token_ids must equal the IDs seen in the logprobs
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling.

    Covers a regular response and a streamed one; in both cases the
    returned IDs are decoded and compared with the returned text.
    """
    system_message = {
        "role": "system",
        "content": "You like to use emojis in your responses."
    }
    user_message = {
        "role": "user",
        "content": "Repeat after me: I love cats 🐱"
    }
    chat_messages = [system_message, user_message]

    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        first_choice = response.choices[0]

        # Both ID fields must be present
        response_dict = response.model_dump()
        assert first_choice.token_ids is not None
        assert "prompt_token_ids" in response_dict

        # The message must carry content
        assert first_choice.message.content is not None

        # Decode the IDs and compare with the expected text
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses.")
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")

        decoded_response = tokenizer.decode(first_choice.token_ids)
        # Decoded IDs equal the message content followed by <|im_end|>
        expected_response = first_choice.message.content + "<|im_end|>"
        assert decoded_response == expected_response

        # Streaming variant of the same request (no logprobs)
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )

        content_pieces = []
        collected_token_ids = []
        awaiting_first_chunk = True
        async for chunk in stream:
            if not awaiting_first_chunk:
                chunk_dump = chunk.model_dump()
                assert "prompt_token_ids" not in chunk_dump, \
                    "Subsequent chunks should not have prompt_token_ids"
            else:
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # The streamed prompt IDs must decode to the same prompt
                decoded_prompt_stream = tokenizer.decode(
                    chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
                awaiting_first_chunk = False

            if not chunk.choices:
                continue
            chunk_choice = chunk.choices[0]
            if chunk_choice.delta.content:
                content_pieces.append(chunk_choice.delta.content)
            # token_ids may be missing from some chunks
            if "token_ids" in chunk_choice.model_dump():
                collected_token_ids.extend(chunk_choice.token_ids)

        collected_content = "".join(content_pieces)

        # Something must have been streamed back
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0

        # The streamed IDs must decode to the streamed content + <|im_end|>
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + "<|im_end|>"
tests/entrypoints/openai/test_score.py
View file @
d2b52805
...
...
@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
MODELS
=
[
{
"name"
:
"BAAI/bge-reranker-v2-m3"
,
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
d2b52805
...
...
@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
repetition_penalty
==
1.05
@
pytest
.
mark
.
parametrize
(
"model_type"
,
[
"gpt_oss"
,
"any"
])
@
pytest
.
mark
.
asyncio
async
def
test_serving_chat_did_set_correct_cache_salt
():
async
def
test_serving_chat_did_set_correct_cache_salt
(
model_type
):
mock_model_config
=
MockModelConfig
()
mock_model_config
.
hf_config
.
model_type
=
model_type
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
...
...
tests/entrypoints/openai/test_token_in_token_out.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
tempfile
import
pytest
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
)
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# Model identifier passed to download_weights_from_hf and get_tokenizer.
MODEL_NAME = "Qwen/Qwen3-0.6B"
# Initial value: a directory under the system temp dir, used as the download
# cache_dir. NOTE: the module-scoped ``server`` fixture rebinds this global to
# the value returned by download_weights_from_hf, and the test then passes the
# rebound value as the ``model`` argument of its request.
MODEL_PATH = os.path.join(tempfile.gettempdir(), "qwen3_06b")
@pytest.fixture(scope="module")
def server():
    """Start a module-wide server from a local download of MODEL_NAME.

    The download skips tokenizer, vocab and safetensors files; the server is
    then launched with ``--skip-tokenizer-init`` and ``--load-format dummy``.
    The global MODEL_PATH is rebound to the path returned by the download so
    that tests can refer to the served model by that path.
    """
    global MODEL_PATH

    excluded_files = ["tokenizer*", "vocab*", "*.safetensors"]
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        ignore_patterns=excluded_files,
    )

    cli_args = ["--max-model-len", "2048"]
    cli_args += ["--max-num-seqs", "128"]
    cli_args += ["--enforce-eager"]
    cli_args += ["--skip-tokenizer-init"]
    cli_args += ["--load-format", "dummy"]

    with RemoteOpenAIServer(MODEL_PATH, cli_args) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Test token-in-token-out: the prompt is sent as a list of token IDs with
    return_token_ids enabled, and the response must carry generated
    token_ids plus prompt_token_ids that decode back to the original text.

    NOTE(review): despite the test name, the request sets neither logprobs
    nor return_tokens_as_token_ids, so no logprobs alignment is checked here.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    async with server.get_async_client() as client:
        # Send token IDs (not text) as the prompt and ask for IDs back
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=20,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )
        choice = completion.choices[0]

        # Between 1 and max_tokens generated IDs must be returned
        assert (choice.token_ids is not None
                and 0 < len(choice.token_ids) <= 20)
        assert choice.prompt_token_ids is not None

        # The returned prompt IDs must decode to exactly the original text.
        # This is checked unconditionally: an empty prompt_token_ids list
        # must fail the test rather than skip the comparison.
        prompt_text = tokenizer.decode(choice.prompt_token_ids)
        assert prompt_text == text
tests/entrypoints/openai/test_transcription_validation.py
View file @
d2b52805
...
...
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
assert
"Mary had a little lamb,"
in
out
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
'text'
]
out_usage
=
out
[
'usage'
]
assert
"Mary had a little lamb,"
in
out_text
assert
out_usage
[
"seconds"
]
==
16
,
out_usage
[
"seconds"
]
@
pytest
.
mark
.
asyncio
...
...
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
counts
=
out
.
count
(
"Mary had a little lamb"
)
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
'text'
]
out_usage
=
out
[
'usage'
]
counts
=
out_text
.
count
(
"Mary had a little lamb"
)
assert
counts
==
10
,
counts
assert
out_usage
[
"seconds"
]
==
161
,
out_usage
[
"seconds"
]
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_truncation.py
View file @
d2b52805
...
...
@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
assert
response
[
"usage"
][
"prompt_tokens"
]
==
truncation_size
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
    """An embeddings request with truncate_prompt_tokens=0 must be rejected.

    The request is expected to raise openai.BadRequestError with status 400,
    error type "BadRequestError", and a message about the model's maximum
    context length.
    """
    truncation_size = 0
    # NOTE(review): ``input`` is presumably a module-level prompt defined
    # elsewhere in this file (shadowing the builtin) -- confirm.
    request_fields: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size
    }

    with pytest.raises(openai.BadRequestError) as err:
        await client.post(path="embeddings",
                          cast_to=object,
                          body=dict(request_fields))

    raised = err.value
    assert raised.status_code == 400

    error_details = raised.response.json()["error"]
    assert error_details["type"] == "BadRequestError"

    # All three fragments must appear in the error message
    message = error_details["message"]
    expected_fragments = (
        "This model's maximum context length is",
        "tokens in the input for embedding generation",
        "Please reduce the length of the input",
    )
    for fragment in expected_fragments:
        assert fragment in message
@
pytest
.
mark
.
asyncio
async
def
test_bigger_truncation_size
(
client
:
openai
.
AsyncOpenAI
):
truncation_size
=
max_model_len
+
1
...
...
@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
}
with
pytest
.
raises
(
openai
.
BadRequestError
)
as
err
:
err
=
await
client
.
post
(
path
=
"embeddings"
,
cast_to
=
object
,
body
=
{
**
kwargs
})
assert
str
(
err
)
==
f
"""openai.BadRequestError:
Error code: 400 - {{'object': 'error',
'message': 'truncate_prompt_tokens value
(
{
truncation_size
}
)
is greater than max_model_len (
{
max_model_len
}
).
Please, select a smaller truncation size.',
'type': 'BadRequestError',
'param': None, 'code': 400}}"""
await
client
.
post
(
path
=
"embeddings"
,
cast_to
=
object
,
body
=
{
**
kwargs
})
assert
err
.
value
.
status_code
==
400
error_details
=
err
.
value
.
response
.
json
()[
"error"
]
assert
error_details
[
"type"
]
==
"BadRequestError"
expected_message
=
(
"truncate_prompt_tokens value is "
"greater than max_model_len."
" Please, select a smaller truncation size."
)
assert
error_details
[
"message"
]
==
expected_message
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_vision.py
View file @
d2b52805
...
...
@@ -6,8 +6,6 @@ import json
import
openai
import
pytest
import
pytest_asyncio
import
requests
from
PIL
import
Image
from
transformers
import
AutoProcessor
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
...
...
@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}{
content
}
"
,
}]
images
=
[
Image
.
open
(
requests
.
ge
t
(
image_url
,
stream
=
True
).
raw
)]
images
=
[
fetch_ima
ge
(
image_url
)]
prompt
=
processor
.
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
...
...
tests/entrypoints/openai/test_vision_embedding.py
View file @
d2b52805
...
...
@@ -5,7 +5,6 @@ import json
import
pytest
import
requests
from
PIL
import
Image
from
transformers
import
AutoProcessor
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
...
...
@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder
=
"<|image_1|> "
prompt
=
f
"
{
placeholder
}{
content
}
"
images
=
[
Image
.
open
(
requests
.
ge
t
(
image_url
,
stream
=
True
).
raw
)]
images
=
[
fetch_ima
ge
(
image_url
)]
inputs
=
processor
(
prompt
,
images
,
return_tensors
=
"pt"
)
return
inputs
.
input_ids
.
shape
[
1
]
...
...
tests/evals/gsm8k/README.md
0 → 100644
View file @
d2b52805
# GSM8K Accuracy Evaluation
This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control.
## Usage
### Run tests with pytest (like buildkite)
```
bash
pytest
-s
-v
tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
\
--tp-size
=
1
```
### Run standalone evaluation script
```
bash
# Start vLLM server first
vllm serve Qwen/Qwen2.5-1.5B-Instruct
--port
8000
# Run evaluation
python tests/evals/gsm8k/gsm8k_eval.py
--port
8000
```
## Configuration Format
Model configs in
`configs/`
directory use this YAML format:
```
yaml
model_name
:
"
Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold
:
0.54
# Minimum expected accuracy
num_questions
:
1319
# Number of questions (default: full test set)
num_fewshot
:
5
# Few-shot examples from train set
max_model_len
:
4096
# Model context length
```
tests/evals/gsm8k/__init__.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold
:
0.74
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold
:
0.31
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold
:
0.45
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold
:
0.60
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
Qwen/Qwen3-0.6B-FP8"
accuracy_threshold
:
0.375
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/models-small.txt
0 → 100644
View file @
d2b52805
Qwen3-0.6B-FP8.yaml
Llama-3.2-1B-Instruct-INT8-CT.yaml
Llama-3-8B-Instruct-nonuniform-CT.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Prev
1
…
7
8
9
10
11
12
13
14
15
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment