Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
501
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
720 additions
and
82 deletions
+720
-82
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+94
-1
tests/entrypoints/openai/test_openai_schema.py
tests/entrypoints/openai/test_openai_schema.py
+38
-25
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+0
-8
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+28
-17
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_token_ids.py
+374
-0
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+0
-9
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+3
-1
tests/entrypoints/openai/test_token_in_token_out.py
tests/entrypoints/openai/test_token_in_token_out.py
+73
-0
tests/entrypoints/openai/test_transcription_validation.py
tests/entrypoints/openai/test_transcription_validation.py
+10
-4
tests/entrypoints/openai/test_truncation.py
tests/entrypoints/openai/test_truncation.py
+31
-12
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+1
-3
tests/entrypoints/openai/test_vision_embedding.py
tests/entrypoints/openai/test_vision_embedding.py
+1
-2
tests/evals/gsm8k/README.md
tests/evals/gsm8k/README.md
+35
-0
tests/evals/gsm8k/__init__.py
tests/evals/gsm8k/__init__.py
+2
-0
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
...vals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...als/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+5
-0
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+5
-0
tests/evals/gsm8k/configs/models-small.txt
tests/evals/gsm8k/configs/models-small.txt
+5
-0
No files found.
Too many changes to show.
To preserve performance only
501 of 501+
files are displayed.
Plain diff
Email patch
tests/entrypoints/openai/test_metrics.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
subprocess
import
subprocess
import
sys
import
sys
import
tempfile
import
tempfile
...
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
...
@@ -294,6 +294,99 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
metric
in
response
.
text
assert
metric
in
response
.
text
@pytest.mark.asyncio
async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                   client: openai.AsyncClient, use_v1: bool):
    """Check that request/KV-cache metrics drop back to zero after aborts.

    The test reads the metrics while the server is idle, starts three
    completion requests, checks that the metrics report activity, cancels
    the client-side tasks and finally checks that the running count, the
    waiting count and the KV cache usage are all reported as zero again.
    """
    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))

    # Expect no running requests or kvcache usage
    assert running_requests == 0
    assert waiting_requests == 0
    assert kv_cache_usage == 0.0

    # Start some long-running requests that we can abort
    tasks = [
        asyncio.create_task(
            client.completions.create(
                model=MODEL_NAME,
                prompt=_TOKENIZED_PROMPT,
                max_tokens=100,  # Long generation to give time to abort
                temperature=0.0)) for _ in range(3)
    ]

    # Wait a bit for requests to start processing
    await asyncio.sleep(0.5)

    # Expect running requests and kvcache usage
    running_requests, waiting_requests, kv_cache_usage = (
        _get_running_metrics_from_api(server))
    assert running_requests > 0
    assert kv_cache_usage > 0

    # Cancel all tasks to abort the requests
    for task in tasks:
        task.cancel()

    # Await the cancelled tasks so the cancellation is actually delivered and
    # no task is left pending (or its outcome left un-retrieved) when the
    # test returns. return_exceptions=True turns each CancelledError into a
    # plain result instead of re-raising it here.
    await asyncio.gather(*tasks, return_exceptions=True)

    # Wait for cancellations to be processed
    await asyncio.sleep(1.0)

    # The helper performs the GET on the metrics endpoint and checks the HTTP
    # status itself, so no separate request is needed here.
    running_requests_after, waiting_requests_after, kv_cache_usage_after = (
        _get_running_metrics_from_api(server))

    assert running_requests_after == 0, (
        f"Expected 0 running requests after abort, got "
        f"{running_requests_after}")
    assert waiting_requests_after == 0, (
        f"Expected 0 waiting requests after abort, got "
        f"{waiting_requests_after}")
    assert kv_cache_usage_after == 0, (
        f"Expected 0% KV cache usage after abort, got "
        f"{kv_cache_usage_after}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
    """Return (running_count, waiting_count, kv_cache_usage).

    Fetches the server's "metrics" endpoint, parses the response text with
    ``text_string_to_metric_families`` and, for each of the three metric
    families of interest, takes the value of the first sample whose name
    equals the family name.

    Fails with an assertion error if the endpoint does not answer with
    HTTP 200 or if any of the three metrics is absent from the response.
    """
    response = requests.get(server.url_for("metrics"))
    assert response.status_code == HTTPStatus.OK

    # Order matters: it defines the order of the returned tuple.
    metric_names = (
        "vllm:num_requests_running",
        "vllm:num_requests_waiting",
        "vllm:gpu_cache_usage_perc",
    )

    values = {}
    for family in text_string_to_metric_families(response.text):
        if family.name not in metric_names:
            continue
        for sample in family.samples:
            if sample.name == family.name:
                values[family.name] = sample.value
                break

    for name in metric_names:
        assert values.get(name) is not None, (
            f"Metric {name!r} not found in the metrics response")

    running_requests, waiting_requests, kv_cache_usage = (
        values[name] for name in metric_names)
    return running_requests, waiting_requests, kv_cache_usage
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""
# noqa: E501
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""
# noqa: E501
...
...
tests/entrypoints/openai/test_openai_schema.py
View file @
d2b52805
...
@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
...
@@ -74,31 +74,44 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}'
\
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}'
\
http://localhost:8000/v1/chat/completions
http://localhost:8000/v1/chat/completions
"""
# noqa: E501
"""
# noqa: E501
if
(
hasattr
(
case
,
"body"
)
and
isinstance
(
case
.
body
,
dict
)
if
hasattr
(
case
,
"body"
)
and
isinstance
(
case
.
body
,
dict
):
and
"messages"
in
case
.
body
if
(
"messages"
in
case
.
body
and
isinstance
(
case
.
body
[
"messages"
],
list
)
and
isinstance
(
case
.
body
[
"messages"
],
list
)
and
len
(
case
.
body
[
"messages"
])
>
0
):
and
len
(
case
.
body
[
"messages"
])
>
0
):
for
message
in
case
.
body
[
"messages"
]:
for
message
in
case
.
body
[
"messages"
]:
if
not
isinstance
(
message
,
dict
):
if
not
isinstance
(
message
,
dict
):
continue
continue
# Check for invalid file type in tokenize endpoint
# Check for invalid file type in tokenize endpoint
if
op
.
method
.
lower
()
==
"post"
and
op
.
path
==
"/tokenize"
:
if
op
.
method
.
lower
()
==
"post"
and
op
.
path
==
"/tokenize"
:
content
=
message
.
get
(
"content"
,
[])
content
=
message
.
get
(
"content"
,
[])
if
(
isinstance
(
content
,
list
)
and
len
(
content
)
>
0
and
any
(
if
(
isinstance
(
content
,
list
)
and
len
(
content
)
>
0
item
.
get
(
"type"
)
==
"file"
for
item
in
content
)):
and
any
(
return
False
item
.
get
(
"type"
)
==
"file"
for
item
in
content
)):
# Check for invalid tool_calls with non-function types
return
False
tool_calls
=
message
.
get
(
"tool_calls"
,
[])
if
isinstance
(
tool_calls
,
list
):
# Check for invalid tool_calls with non-function types
for
tool_call
in
tool_calls
:
tool_calls
=
message
.
get
(
"tool_calls"
,
[])
if
isinstance
(
tool_call
,
dict
):
if
isinstance
(
tool_calls
,
list
):
if
tool_call
.
get
(
"type"
)
!=
"function"
:
for
tool_call
in
tool_calls
:
return
False
if
isinstance
(
tool_call
,
dict
):
if
"custom"
in
tool_call
:
if
tool_call
.
get
(
"type"
)
!=
"function"
:
return
False
return
False
if
"custom"
in
tool_call
:
return
False
# Sometimes guided_grammar is generated to be empty
# Causing a server error in EBNF grammar parsing
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
guided_grammar
=
case
.
body
.
get
(
"guided_grammar"
)
if
guided_grammar
==
''
:
# Allow None (will be handled as no grammar)
# But skip empty strings
return
False
return
True
return
True
return
strategy
.
filter
(
no_invalid_types
)
return
strategy
.
filter
(
no_invalid_types
)
...
...
tests/entrypoints/openai/test_rerank.py
View file @
d2b52805
...
@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
...
@@ -14,14 +14,6 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
...
...
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
d2b52805
...
@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
...
@@ -11,18 +11,25 @@ from openai import BadRequestError, NotFoundError, OpenAI
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
pytest
.
skip
(
allow_module_level
=
True
,
reason
=
"gpt-oss can't run on CI yet."
)
MODEL_NAME
=
"openai/gpt-oss-20b"
MODEL_NAME
=
"openai/gpt-oss-20b"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
monkeypatch_module
:
pytest
.
MonkeyPatch
):
args
=
[
"--enforce-eager"
,
"--tool-server"
,
"demo"
]
args
=
[
"--enforce-eager"
,
"--tool-server"
,
"demo"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
monkeypatch_module
.
context
()
as
m
:
yield
remote_server
m
.
setenv
(
"VLLM_ENABLE_RESPONSES_API_STORE"
,
"1"
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
@
pytest_asyncio
.
fixture
...
@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
...
@@ -269,10 +276,11 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
):
# TODO: Add back when web search and code interpreter are available in CI
prompts
=
[
prompts
=
[
"tell me a story about a cat in 20 words"
,
"tell me a story about a cat in 20 words"
,
"What is 13 * 24? Use python to calculate the result."
,
#
"What is 13 * 24? Use python to calculate the result.",
"When did Jensen found NVIDIA? Search it and answer the year only."
,
#
"When did Jensen found NVIDIA? Search it and answer the year only.",
]
]
for
prompt
in
prompts
:
for
prompt
in
prompts
:
...
@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
...
@@ -281,15 +289,15 @@ async def test_streaming(client: OpenAI, model_name: str):
input
=
prompt
,
input
=
prompt
,
reasoning
=
{
"effort"
:
"low"
},
reasoning
=
{
"effort"
:
"low"
},
tools
=
[
tools
=
[
{
#
{
"type"
:
"web_search_preview"
#
"type": "web_search_preview"
},
#
},
{
#
{
"type"
:
"code_interpreter"
,
#
"type": "code_interpreter",
"container"
:
{
#
"container": {
"type"
:
"auto"
#
"type": "auto"
}
#
}
},
#
},
],
],
stream
=
True
,
stream
=
True
,
)
)
...
@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
...
@@ -317,6 +325,7 @@ async def test_streaming(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"Web search tool is not available in CI yet."
)
async
def
test_web_search
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_web_search
(
client
:
OpenAI
,
model_name
:
str
):
response
=
await
client
.
responses
.
create
(
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
model
=
model_name
,
...
@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
...
@@ -331,6 +340,7 @@ async def test_web_search(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"Code interpreter tool is not available in CI yet."
)
async
def
test_code_interpreter
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_code_interpreter
(
client
:
OpenAI
,
model_name
:
str
):
response
=
await
client
.
responses
.
create
(
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
model
=
model_name
,
...
@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
...
@@ -436,6 +446,7 @@ async def test_function_calling(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
flaky
(
reruns
=
5
)
async
def
test_function_calling_multi_turn
(
client
:
OpenAI
,
model_name
:
str
):
async
def
test_function_calling_multi_turn
(
client
:
OpenAI
,
model_name
:
str
):
tools
=
[
tools
=
[
{
{
...
...
tests/entrypoints/openai/test_return_token_ids.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen2.5-1.5B-Instruct"
@pytest.fixture(scope="module")
def server():
    """Start a RemoteOpenAIServer for MODEL_NAME and yield it.

    The fixture is module-scoped, so it is set up once for the test module
    and torn down when the ``with`` block exits after the last test.
    """
    cli_args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--enforce-eager",
    ]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest.mark.asyncio
async def test_basic_completion_with_emoji(server):
    """Test basic completion with emoji to verify token_ids field.

    With ``return_token_ids`` enabled the first choice must carry both
    ``prompt_token_ids`` and ``token_ids``; with it disabled both must be
    absent (or None) on the choice.
    """
    prompt = "Complete this sentence with emojis: I love coding 🚀"

    async with server.get_async_client() as client:
        # Test with return_token_ids enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": True},
        )
        choice = completion.choices[0]

        # Check the raw response to see the structure
        choice_dict = completion.model_dump()["choices"][0]

        # Verify prompt_token_ids field is present in the completion response
        assert "prompt_token_ids" in choice_dict
        assert isinstance(choice.prompt_token_ids, list)

        # Check against the expected prompt token IDs: the locally encoded
        # prompt must appear as a contiguous run inside prompt_token_ids.
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
        encoded_tokens = tokenizer.encode(prompt)
        window = len(encoded_tokens)
        assert any(
            choice.prompt_token_ids[start:start + window] == encoded_tokens
            for start in range(len(choice.prompt_token_ids) - window + 1))

        # Verify token_ids field is present in the choice
        assert choice.token_ids is not None
        assert isinstance(choice.token_ids, list)
        assert len(choice.token_ids) > 0

        # Verify decoding works correctly. Only a prefix match is required:
        # the decoded ids may be longer than the returned text (for example
        # a trailing <|im_end|> token).
        decoded_text = tokenizer.decode(choice.token_ids)
        assert decoded_text.startswith(choice.text)

        # Test without return_token_ids (should be None)
        completion_without = await client.completions.create(
            model=MODEL_NAME,
            prompt=prompt,
            max_tokens=10,
            temperature=0,
            logprobs=1,
            extra_body={"return_token_ids": False},
        )

        # Both fields live on the choice for the completions API, so that is
        # where their absence has to be checked.
        choice_without_dict = completion_without.model_dump()["choices"][0]
        assert choice_without_dict.get("token_ids") is None
        assert choice_without_dict.get("prompt_token_ids") is None
@pytest.mark.asyncio
async def test_chat_completion_with_tool_use(server):
    """Test chat completion with tool use (get_weather function)."""
    location_schema = {
        "type": "string",
        "description": "The city and state, e.g. San Francisco, CA",
    }
    unit_schema = {
        "type": "string",
        "enum": ["celsius", "fahrenheit"],
        "description": "The unit of temperature",
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": location_schema,
                    "unit": unit_schema,
                },
                "required": ["location"],
            },
        },
    }]

    # Everything except extra_body is the same for both requests below.
    shared_request = dict(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What's the weather like in Paris?"
            },
        ],
        tools=tools,
        tool_choice="auto",
        max_tokens=100,
        temperature=0,
        logprobs=True,
    )

    async with server.get_async_client() as client:
        # Request with return_token_ids switched on
        response = await client.chat.completions.create(
            **shared_request,
            extra_body={"return_token_ids": True},
        )
        first_choice = response.choices[0]

        # token_ids must be a list on the choice
        assert first_choice.token_ids is not None
        assert isinstance(first_choice.token_ids, list)

        # prompt_token_ids must be a list on the response
        assert response.prompt_token_ids is not None
        assert isinstance(response.prompt_token_ids, list)

        # Decode both id sequences and compare against the expected text
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        prompt_text = tokenizer.decode(response.prompt_token_ids)
        assert prompt_text.startswith(
            "<|im_start|>system\nYou are a helpful assistant.")
        assert prompt_text.endswith(
            "What's the weather like in Paris?<|im_end|>\n"
            "<|im_start|>assistant\n")

        response_text = tokenizer.decode(first_choice.token_ids)
        assert response_text.startswith('<tool_call>\n{"name": "get_weather"')
        assert response_text.endswith("</tool_call><|im_end|>")

        # When the message carries tool calls, check the first one
        tool_calls = first_choice.message.tool_calls
        if tool_calls:
            assert len(tool_calls) > 0
            assert tool_calls[0].function.name == "get_weather"

        # Request with return_token_ids switched off
        response_without = await client.chat.completions.create(
            **shared_request,
            extra_body={"return_token_ids": False},
        )

        assert response_without.choices[0].token_ids is None
        assert response_without.prompt_token_ids is None
@pytest.mark.asyncio
async def test_comparison_with_prompt_logprobs_and_logprobs(server):
    """
    Test that token_ids align with prompt_logprobs and
    logprobs when return_tokens_as_token_ids is enabled.
    """

    def parse_logprob_tokens(tokens):
        # Entries look like "token_id:12345" when
        # return_tokens_as_token_ids is True; anything else is ignored.
        return [
            int(entry.removeprefix("token_id:")) for entry in tokens
            if entry.startswith("token_id:")
        ]

    async with server.get_async_client() as client:
        # Non-streaming request with return_token_ids and
        # return_tokens_as_token_ids both enabled
        completion = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world! How are you today?",
            max_tokens=20,
            temperature=0,
            echo=True,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True,
                "prompt_logprobs": 1
            },
        )
        choice = completion.choices[0]

        # All four fields have to be populated
        assert choice.token_ids is not None
        assert choice.prompt_token_ids is not None
        assert choice.prompt_logprobs is not None
        assert choice.logprobs is not None

        # Token IDs recovered from the logprobs entries
        logprobs_token_ids = parse_logprob_tokens(choice.logprobs.tokens)

        # With echo=True the logprobs cover prompt and response tokens, so
        # they are strictly longer than token_ids, and token_ids must equal
        # the tail of the logprobs ids.
        assert len(choice.token_ids) < len(logprobs_token_ids)
        response_token_ids_length = len(choice.token_ids)
        tail = logprobs_token_ids[-response_token_ids_length:]
        assert tail == choice.token_ids

        # Tokenizer round-trip checks
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        if choice.prompt_token_ids:
            prompt_text = tokenizer.decode(choice.prompt_token_ids)
            # The decoded prompt should contain the start of the original
            assert "Hello, world" in prompt_text

        if choice.token_ids:
            response_text = tokenizer.decode(choice.token_ids)
            assert choice.text.endswith(response_text)

        # Streaming request
        stream = await client.completions.create(
            model=MODEL_NAME,
            prompt="Tell me a short fact about Python:",
            max_tokens=30,
            temperature=0,
            stream=True,
            echo=False,
            logprobs=1,
            extra_body={
                "return_token_ids": True,
                "return_tokens_as_token_ids": True
            },
        )

        # Accumulate what the stream delivers
        streamed_prompt_token_ids = []
        streamed_token_ids = []
        streamed_logprob_token_ids = []
        chunks_seen = 0
        async for chunk in stream:
            chunk_choice = chunk.choices[0]
            streamed_logprob_token_ids.extend(
                parse_logprob_tokens(chunk_choice.logprobs.tokens))
            if chunks_seen == 0:
                # Only the first chunk is read for prompt_token_ids
                streamed_prompt_token_ids = chunk_choice.prompt_token_ids
            chunks_seen += 1
            streamed_token_ids += chunk_choice.token_ids

        # The first chunk carried prompt ids, and the streamed token_ids
        # agree with the ids parsed from the streamed logprobs
        assert len(streamed_prompt_token_ids) > 0
        assert streamed_token_ids == streamed_logprob_token_ids
@pytest.mark.asyncio
async def test_chat_completion_with_emoji_and_token_ids(server):
    """Test chat completion with emojis to verify token_ids handling."""
    chat_messages = [
        {
            "role": "system",
            "content": "You like to use emojis in your responses."
        },
        {
            "role": "user",
            "content": "Repeat after me: I love cats 🐱"
        },
    ]
    end_marker = "<|im_end|>"

    async with server.get_async_client() as client:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            logprobs=True,
            extra_body={"return_token_ids": True},
        )
        first_choice = response.choices[0]

        # token_ids on the choice, prompt_token_ids on the response
        response_dict = response.model_dump()
        assert first_choice.token_ids is not None
        assert "prompt_token_ids" in response_dict

        # The message must have content
        assert first_choice.message.content is not None

        # Decode the prompt ids and compare with the expected chat template
        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

        decoded_prompt = tokenizer.decode(response.prompt_token_ids)
        assert decoded_prompt.startswith(
            "<|im_start|>system\nYou like to use emojis in your responses.")
        assert decoded_prompt.endswith(
            "I love cats 🐱<|im_end|>\n<|im_start|>assistant\n")

        # Decoded response ids equal the message content followed by the
        # closing <|im_end|>
        decoded_response = tokenizer.decode(first_choice.token_ids)
        assert decoded_response == first_choice.message.content + end_marker

        # Same request, streamed
        stream = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=chat_messages,
            max_tokens=50,
            temperature=0,
            stream=True,
            extra_body={"return_token_ids": True},
        )

        content_parts = []
        collected_token_ids = []
        seen_first_chunk = False
        async for chunk in stream:
            if seen_first_chunk:
                assert "prompt_token_ids" not in chunk.model_dump(), \
                    "Subsequent chunks should not have prompt_token_ids"
            else:
                assert chunk.prompt_token_ids is not None
                assert isinstance(chunk.prompt_token_ids, list)
                # The streamed prompt ids decode to the same prompt text
                decoded_prompt_stream = tokenizer.decode(
                    chunk.prompt_token_ids)
                assert decoded_prompt_stream == decoded_prompt
                seen_first_chunk = True

            if not chunk.choices:
                continue

            stream_choice = chunk.choices[0]
            if stream_choice.delta.content:
                content_parts.append(stream_choice.delta.content)
            # token_ids may not be present in all chunks
            if "token_ids" in stream_choice.model_dump():
                collected_token_ids.extend(stream_choice.token_ids)

        collected_content = "".join(content_parts)

        # Something was streamed, both as text and as ids
        assert len(collected_content) > 0
        assert len(collected_token_ids) > 0

        # The streamed ids decode to the streamed text plus <|im_end|>
        decoded_response = tokenizer.decode(collected_token_ids)
        assert decoded_response == collected_content + end_marker
tests/entrypoints/openai/test_score.py
View file @
d2b52805
...
@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
...
@@ -12,15 +12,6 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
MODELS
=
[
MODELS
=
[
{
{
"name"
:
"BAAI/bge-reranker-v2-m3"
,
"name"
:
"BAAI/bge-reranker-v2-m3"
,
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
d2b52805
...
@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config():
...
@@ -282,9 +282,11 @@ async def test_serving_chat_could_load_correct_generation_config():
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
repetition_penalty
==
1.05
assert
mock_engine
.
generate
.
call_args
.
args
[
1
].
repetition_penalty
==
1.05
@
pytest
.
mark
.
parametrize
(
"model_type"
,
[
"gpt_oss"
,
"any"
])
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_serving_chat_did_set_correct_cache_salt
():
async
def
test_serving_chat_did_set_correct_cache_salt
(
model_type
):
mock_model_config
=
MockModelConfig
()
mock_model_config
=
MockModelConfig
()
mock_model_config
.
hf_config
.
model_type
=
model_type
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
mock_engine
.
get_tokenizer
.
return_value
=
get_tokenizer
(
MODEL_NAME
)
...
...
tests/entrypoints/openai/test_token_in_token_out.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
tempfile
import
pytest
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
)
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen3-0.6B"
MODEL_PATH
=
os
.
path
.
join
(
tempfile
.
gettempdir
(),
"qwen3_06b"
)
@pytest.fixture(scope="module")
def server():
    """Download MODEL_NAME without tokenizer/weight files and serve it.

    Files matching ``tokenizer*``, ``vocab*`` and ``*.safetensors`` are
    excluded from the download. The module-level MODEL_PATH is rebound to
    the path returned by ``download_weights_from_hf`` so that tests can pass
    it as the model name. The server is started with
    ``--skip-tokenizer-init`` and ``--load-format dummy``.
    """
    global MODEL_PATH

    excluded_files = ["tokenizer*", "vocab*", "*.safetensors"]
    MODEL_PATH = download_weights_from_hf(
        MODEL_NAME,
        allow_patterns=["*"],
        cache_dir=MODEL_PATH,
        ignore_patterns=excluded_files,
    )

    cli_args = [
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
        "--skip-tokenizer-init",
        "--load-format",
        "dummy",
    ]
    with RemoteOpenAIServer(MODEL_PATH, cli_args) as running_server:
        yield running_server
@pytest.mark.asyncio
async def test_token_in_token_out_and_logprobs(server):
    """
    Test token-in-token-out: send the prompt as token ids and check the
    token ids returned when return_token_ids is enabled.

    The request does not ask for logprobs or return_tokens_as_token_ids, so
    only ``token_ids`` and ``prompt_token_ids`` are checked here.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    text = "Hello, world! How are you today?"
    token_ids = tokenizer.encode(text)
    max_tokens = 20

    async with server.get_async_client() as client:
        # The prompt is passed as token ids, not as text
        completion = await client.completions.create(
            model=MODEL_PATH,
            prompt=token_ids,
            max_tokens=max_tokens,
            temperature=0,
            echo=True,
            extra_body={
                "return_token_ids": True,
            },
        )
        choice = completion.choices[0]

        # Between 1 and max_tokens generated ids must come back
        assert choice.token_ids is not None
        assert 0 < len(choice.token_ids) <= max_tokens

        # The prompt ids must come back and decode to exactly the original
        # text. This is checked unconditionally so that an empty list is
        # reported as a failure instead of skipping the comparison.
        assert choice.prompt_token_ids is not None
        prompt_text = tokenizer.decode(choice.prompt_token_ids)
        assert prompt_text == text
tests/entrypoints/openai/test_transcription_validation.py
View file @
d2b52805
...
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
...
@@ -69,8 +69,11 @@ async def test_basic_audio(mary_had_lamb, model_name):
language
=
"en"
,
language
=
"en"
,
response_format
=
"text"
,
response_format
=
"text"
,
temperature
=
0.0
)
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
out
=
json
.
loads
(
transcription
)
assert
"Mary had a little lamb,"
in
out
out_text
=
out
[
'text'
]
out_usage
=
out
[
'usage'
]
assert
"Mary had a little lamb,"
in
out_text
assert
out_usage
[
"seconds"
]
==
16
,
out_usage
[
"seconds"
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
...
@@ -116,9 +119,12 @@ async def test_long_audio_request(mary_had_lamb, client):
language
=
"en"
,
language
=
"en"
,
response_format
=
"text"
,
response_format
=
"text"
,
temperature
=
0.0
)
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
out
=
json
.
loads
(
transcription
)
counts
=
out
.
count
(
"Mary had a little lamb"
)
out_text
=
out
[
'text'
]
out_usage
=
out
[
'usage'
]
counts
=
out_text
.
count
(
"Mary had a little lamb"
)
assert
counts
==
10
,
counts
assert
counts
==
10
,
counts
assert
out_usage
[
"seconds"
]
==
161
,
out_usage
[
"seconds"
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_truncation.py
View file @
d2b52805
...
@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
...
@@ -64,6 +64,28 @@ async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
assert
response
[
"usage"
][
"prompt_tokens"
]
==
truncation_size
assert
response
[
"usage"
][
"prompt_tokens"
]
==
truncation_size
@
pytest
.
mark
.
asyncio
async
def
test_zero_truncation_size
(
client
:
openai
.
AsyncOpenAI
):
truncation_size
=
0
kwargs
:
dict
[
str
,
Any
]
=
{
"model"
:
MODEL_NAME
,
"input"
:
input
,
"truncate_prompt_tokens"
:
truncation_size
}
with
pytest
.
raises
(
openai
.
BadRequestError
)
as
err
:
await
client
.
post
(
path
=
"embeddings"
,
cast_to
=
object
,
body
=
{
**
kwargs
})
assert
err
.
value
.
status_code
==
400
error_details
=
err
.
value
.
response
.
json
()[
"error"
]
assert
error_details
[
"type"
]
==
"BadRequestError"
assert
"This model's maximum context length is"
in
error_details
[
"message"
]
assert
"tokens in the input for embedding generation"
in
error_details
[
"message"
]
assert
"Please reduce the length of the input"
in
error_details
[
"message"
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_bigger_truncation_size
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_bigger_truncation_size
(
client
:
openai
.
AsyncOpenAI
):
truncation_size
=
max_model_len
+
1
truncation_size
=
max_model_len
+
1
...
@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
...
@@ -74,18 +96,15 @@ async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
}
}
with
pytest
.
raises
(
openai
.
BadRequestError
)
as
err
:
with
pytest
.
raises
(
openai
.
BadRequestError
)
as
err
:
err
=
await
client
.
post
(
path
=
"embeddings"
,
await
client
.
post
(
path
=
"embeddings"
,
cast_to
=
object
,
body
=
{
**
kwargs
})
cast_to
=
object
,
body
=
{
**
kwargs
})
assert
err
.
value
.
status_code
==
400
error_details
=
err
.
value
.
response
.
json
()[
"error"
]
assert
str
(
err
)
==
f
"""openai.BadRequestError:
assert
error_details
[
"type"
]
==
"BadRequestError"
Error code: 400 - {{'object': 'error',
expected_message
=
(
"truncate_prompt_tokens value is "
'message': 'truncate_prompt_tokens value
"greater than max_model_len."
(
{
truncation_size
}
)
" Please, select a smaller truncation size."
)
is greater than max_model_len (
{
max_model_len
}
).
assert
error_details
[
"message"
]
==
expected_message
Please, select a smaller truncation size.',
'type': 'BadRequestError',
'param': None, 'code': 400}}"""
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_vision.py
View file @
d2b52805
...
@@ -6,8 +6,6 @@ import json
...
@@ -6,8 +6,6 @@ import json
import
openai
import
openai
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
import
requests
from
PIL
import
Image
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
...
@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
...
@@ -88,7 +86,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}{
content
}
"
,
"content"
:
f
"
{
placeholder
}{
content
}
"
,
}]
}]
images
=
[
Image
.
open
(
requests
.
ge
t
(
image_url
,
stream
=
True
).
raw
)]
images
=
[
fetch_ima
ge
(
image_url
)]
prompt
=
processor
.
tokenizer
.
apply_chat_template
(
prompt
=
processor
.
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
...
...
tests/entrypoints/openai/test_vision_embedding.py
View file @
d2b52805
...
@@ -5,7 +5,6 @@ import json
...
@@ -5,7 +5,6 @@ import json
import
pytest
import
pytest
import
requests
import
requests
from
PIL
import
Image
from
transformers
import
AutoProcessor
from
transformers
import
AutoProcessor
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
from
vllm.entrypoints.openai.protocol
import
EmbeddingResponse
...
@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
...
@@ -64,7 +63,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
placeholder
=
"<|image_1|> "
placeholder
=
"<|image_1|> "
prompt
=
f
"
{
placeholder
}{
content
}
"
prompt
=
f
"
{
placeholder
}{
content
}
"
images
=
[
Image
.
open
(
requests
.
ge
t
(
image_url
,
stream
=
True
).
raw
)]
images
=
[
fetch_ima
ge
(
image_url
)]
inputs
=
processor
(
prompt
,
images
,
return_tensors
=
"pt"
)
inputs
=
processor
(
prompt
,
images
,
return_tensors
=
"pt"
)
return
inputs
.
input_ids
.
shape
[
1
]
return
inputs
.
input_ids
.
shape
[
1
]
...
...
tests/evals/gsm8k/README.md
0 → 100644
View file @
d2b52805
# GSM8K Accuracy Evaluation
This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control.
## Usage
### Run tests with pytest (as done in Buildkite CI)
```
bash
pytest
-s
-v
tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
\
--tp-size
=
1
```
### Run standalone evaluation script
```
bash
# Start vLLM server first
vllm serve Qwen/Qwen2.5-1.5B-Instruct
--port
8000
# Run evaluation
python tests/evals/gsm8k/gsm8k_eval.py
--port
8000
```
## Configuration Format
Model configs in
`configs/`
directory use this YAML format:
```
yaml
model_name
:
"
Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold
:
0.54
# Minimum expected accuracy
num_questions
:
1319
# Number of questions (default: full test set)
num_fewshot
:
5
# Few-shot examples from train set
max_model_len
:
4096
# Model context length
```
tests/evals/gsm8k/__init__.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold
:
0.74
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold
:
0.31
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold
:
0.45
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold
:
0.60
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
0 → 100644
View file @
d2b52805
model_name
:
"
Qwen/Qwen3-0.6B-FP8"
accuracy_threshold
:
0.375
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
tests/evals/gsm8k/configs/models-small.txt
0 → 100644
View file @
d2b52805
Qwen3-0.6B-FP8.yaml
Llama-3.2-1B-Instruct-INT8-CT.yaml
Llama-3-8B-Instruct-nonuniform-CT.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Prev
1
…
7
8
9
10
11
12
13
14
15
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment