Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a810671a
Commit
a810671a
authored
Jan 08, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori
parents
86b5aefe
6a09612b
Changes
291
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
352 additions
and
26 deletions
+352
-26
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+134
-1
tests/entrypoints/instrumentator/__init__.py
tests/entrypoints/instrumentator/__init__.py
+0
-0
tests/entrypoints/instrumentator/test_metrics.py
tests/entrypoints/instrumentator/test_metrics.py
+2
-3
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+1
-0
tests/entrypoints/openai/test_completion_error.py
tests/entrypoints/openai/test_completion_error.py
+1
-0
tests/entrypoints/openai/test_response_api_parsable_context.py
.../entrypoints/openai/test_response_api_parsable_context.py
+6
-0
tests/entrypoints/openai/test_response_api_simple.py
tests/entrypoints/openai/test_response_api_simple.py
+45
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+82
-10
tests/entrypoints/openai/test_transcription_validation_whisper.py
...trypoints/openai/test_transcription_validation_whisper.py
+32
-0
tests/entrypoints/openai/test_translation_validation.py
tests/entrypoints/openai/test_translation_validation.py
+33
-0
tests/entrypoints/rpc/__init__.py
tests/entrypoints/rpc/__init__.py
+0
-0
tests/entrypoints/rpc/test_collective_rpc.py
tests/entrypoints/rpc/test_collective_rpc.py
+1
-1
tests/entrypoints/sleep/__init__.py
tests/entrypoints/sleep/__init__.py
+0
-0
tests/entrypoints/sleep/test_sleep.py
tests/entrypoints/sleep/test_sleep.py
+1
-1
tests/evals/gsm8k/README.md
tests/evals/gsm8k/README.md
+9
-4
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+1
-2
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
...vals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...als/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+1
-1
No files found.
tests/engine/test_arg_utils.py
View file @
a810671a
...
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
...
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import
pytest
import
pytest
from
vllm.config
import
CompilationConfig
,
config
from
vllm.config
import
AttentionConfig
,
CompilationConfig
,
config
from
vllm.engine.arg_utils
import
(
from
vllm.engine.arg_utils
import
(
EngineArgs
,
EngineArgs
,
contains_type
,
contains_type
,
...
@@ -298,6 +298,139 @@ def test_compilation_config():
...
@@ -298,6 +298,139 @@ def test_compilation_config():
)
)
def
test_attention_config
():
from
vllm.attention.backends.registry
import
AttentionBackendEnum
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
# default value
args
=
parser
.
parse_args
([])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
==
AttentionConfig
()
# set backend via dot notation
args
=
parser
.
parse_args
([
"--attention-config.backend"
,
"FLASH_ATTN"
])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASH_ATTN"
# set backend via --attention-backend shorthand
args
=
parser
.
parse_args
([
"--attention-backend"
,
"FLASHINFER"
])
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_backend
is
not
None
assert
engine_args
.
attention_backend
==
"FLASHINFER"
# set all fields via dot notation
args
=
parser
.
parse_args
(
[
"--attention-config.backend"
,
"FLASH_ATTN"
,
"--attention-config.flash_attn_version"
,
"3"
,
"--attention-config.use_prefill_decode_attention"
,
"true"
,
"--attention-config.flash_attn_max_num_splits_for_cuda_graph"
,
"16"
,
"--attention-config.use_cudnn_prefill"
,
"true"
,
"--attention-config.use_trtllm_ragged_deepseek_prefill"
,
"true"
,
"--attention-config.use_trtllm_attention"
,
"true"
,
"--attention-config.disable_flashinfer_prefill"
,
"true"
,
"--attention-config.disable_flashinfer_q_quantization"
,
"true"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASH_ATTN"
assert
engine_args
.
attention_config
.
flash_attn_version
==
3
assert
engine_args
.
attention_config
.
use_prefill_decode_attention
is
True
assert
engine_args
.
attention_config
.
flash_attn_max_num_splits_for_cuda_graph
==
16
assert
engine_args
.
attention_config
.
use_cudnn_prefill
is
True
assert
engine_args
.
attention_config
.
use_trtllm_ragged_deepseek_prefill
is
True
assert
engine_args
.
attention_config
.
use_trtllm_attention
is
True
assert
engine_args
.
attention_config
.
disable_flashinfer_prefill
is
True
assert
engine_args
.
attention_config
.
disable_flashinfer_q_quantization
is
True
# set to string form of a dict with all fields
args
=
parser
.
parse_args
(
[
"--attention-config="
'{"backend": "FLASHINFER", "flash_attn_version": 2, '
'"use_prefill_decode_attention": false, '
'"flash_attn_max_num_splits_for_cuda_graph": 8, '
'"use_cudnn_prefill": false, '
'"use_trtllm_ragged_deepseek_prefill": false, '
'"use_trtllm_attention": false, '
'"disable_flashinfer_prefill": false, '
'"disable_flashinfer_q_quantization": false}'
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
assert
engine_args
.
attention_config
.
backend
is
not
None
assert
engine_args
.
attention_config
.
backend
.
name
==
"FLASHINFER"
assert
engine_args
.
attention_config
.
flash_attn_version
==
2
assert
engine_args
.
attention_config
.
use_prefill_decode_attention
is
False
assert
engine_args
.
attention_config
.
flash_attn_max_num_splits_for_cuda_graph
==
8
assert
engine_args
.
attention_config
.
use_cudnn_prefill
is
False
assert
engine_args
.
attention_config
.
use_trtllm_ragged_deepseek_prefill
is
False
assert
engine_args
.
attention_config
.
use_trtllm_attention
is
False
assert
engine_args
.
attention_config
.
disable_flashinfer_prefill
is
False
assert
engine_args
.
attention_config
.
disable_flashinfer_q_quantization
is
False
# test --attention-backend flows into VllmConfig.attention_config
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-backend"
,
"FLASH_ATTN"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
vllm_config
=
engine_args
.
create_engine_config
()
assert
vllm_config
.
attention_config
.
backend
==
AttentionBackendEnum
.
FLASH_ATTN
# test --attention-config.backend flows into VllmConfig.attention_config
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-config.backend"
,
"FLASHINFER"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
vllm_config
=
engine_args
.
create_engine_config
()
assert
vllm_config
.
attention_config
.
backend
==
AttentionBackendEnum
.
FLASHINFER
# test --attention-backend and --attention-config.backend are mutually exclusive
args
=
parser
.
parse_args
(
[
"--model"
,
"facebook/opt-125m"
,
"--attention-backend"
,
"FLASH_ATTN"
,
"--attention-config.backend"
,
"FLASHINFER"
,
]
)
assert
args
is
not
None
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
with
pytest
.
raises
(
ValueError
,
match
=
"mutually exclusive"
):
engine_args
.
create_engine_config
()
def
test_prefix_cache_default
():
def
test_prefix_cache_default
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
args
=
parser
.
parse_args
([])
args
=
parser
.
parse_args
([])
...
...
tests/entrypoints/instrumentator/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_metrics.py
→
tests/entrypoints/
instrumentator
/test_metrics.py
View file @
a810671a
...
@@ -14,11 +14,10 @@ import requests
...
@@ -14,11 +14,10 @@ import requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
from
tests.conftest
import
LocalAssetServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm
import
version
from
vllm
import
version
from
...conftest
import
LocalAssetServer
from
...utils
import
RemoteOpenAIServer
MODELS
=
{
MODELS
=
{
"text"
:
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
,
"text"
:
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
,
"multimodal"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
,
"multimodal"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
,
...
...
tests/entrypoints/openai/test_chat_error.py
View file @
a810671a
...
@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
...
@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request
,
lora_request
,
trace_headers
,
trace_headers
,
priority
,
priority
,
data_parallel_rank
,
):
):
return
dict
(
engine_prompt
),
{}
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_completion_error.py
View file @
a810671a
...
@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
...
@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
lora_request
,
lora_request
,
trace_headers
,
trace_headers
,
priority
,
priority
,
data_parallel_rank
,
):
):
return
dict
(
engine_prompt
),
{}
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_response_api_parsable_context.py
View file @
a810671a
...
@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
...
@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model
=
model_name
,
model
=
model_name
,
input
=
"What is 13 * 24? Use python to calculate the result."
,
input
=
"What is 13 * 24? Use python to calculate the result."
,
tools
=
[{
"type"
:
"code_interpreter"
,
"container"
:
{
"type"
:
"auto"
}}],
tools
=
[{
"type"
:
"code_interpreter"
,
"container"
:
{
"type"
:
"auto"
}}],
extra_body
=
{
"enable_response_messages"
:
True
},
temperature
=
0.0
,
temperature
=
0.0
,
)
)
...
@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
...
@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output
# make sure the correct math is in the final output
assert
response
.
output
[
3
].
type
==
"message"
assert
response
.
output
[
3
].
type
==
"message"
assert
"312"
in
response
.
output
[
3
].
content
[
0
].
text
assert
"312"
in
response
.
output
[
3
].
content
[
0
].
text
# test raw input_messages / output_messages
assert
len
(
response
.
input_messages
)
==
1
assert
len
(
response
.
output_messages
)
==
3
assert
"312"
in
response
.
output_messages
[
2
][
"message"
]
tests/entrypoints/openai/test_response_api_simple.py
View file @
a810671a
...
@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
...
@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
assert
response
.
output
[
0
].
type
==
"reasoning"
assert
response
.
output
[
0
].
type
==
"reasoning"
assert
response
.
output
[
1
].
type
==
"message"
assert
response
.
output
[
1
].
type
==
"message"
assert
type
(
response
.
output
[
1
].
content
[
0
].
text
)
is
str
assert
type
(
response
.
output
[
1
].
content
[
0
].
text
)
is
str
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_streaming_output_consistency
(
client
:
OpenAI
,
model_name
:
str
):
"""Test that streaming delta text matches the final response output_text.
This test verifies that when using streaming mode:
1. The concatenated text from all 'response.output_text.delta' events
2. Matches the 'output_text' in the final 'response.completed' event
"""
response
=
await
client
.
responses
.
create
(
model
=
model_name
,
input
=
"Say hello in one sentence."
,
stream
=
True
,
)
events
=
[]
async
for
event
in
response
:
events
.
append
(
event
)
assert
len
(
events
)
>
0
# Concatenate all delta text from streaming events
streaming_text
=
""
.
join
(
event
.
delta
for
event
in
events
if
event
.
type
==
"response.output_text.delta"
)
# Get the final response from the last event
response_completed_event
=
events
[
-
1
]
assert
response_completed_event
.
type
==
"response.completed"
assert
response_completed_event
.
response
.
status
==
"completed"
# Get output_text from the final response
final_output_text
=
response_completed_event
.
response
.
output_text
# Verify final response has output
assert
len
(
response_completed_event
.
response
.
output
)
>
0
# Verify streaming text matches final output_text
assert
streaming_text
==
final_output_text
,
(
f
"Streaming text does not match final output_text.
\n
"
f
"Streaming:
{
streaming_text
!
r
}
\n
"
f
"Final:
{
final_output_text
!
r
}
"
)
tests/entrypoints/openai/test_serving_chat.py
View file @
a810671a
...
@@ -52,8 +52,19 @@ def with_tool_parser(request) -> bool:
...
@@ -52,8 +52,19 @@ def with_tool_parser(request) -> bool:
return
request
.
param
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
],
ids
=
[
"exclude_tools_when_tool_choice_none"
],
)
def
exclude_tools_when_tool_choice_none
(
request
)
->
bool
:
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
with_tool_parser
:
bool
):
def
default_server_args
(
with_tool_parser
:
bool
,
exclude_tools_when_tool_choice_none
:
bool
):
args
=
[
args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--enforce-eager"
,
"--enforce-eager"
,
...
@@ -72,19 +83,16 @@ def default_server_args(with_tool_parser: bool):
...
@@ -72,19 +83,16 @@ def default_server_args(with_tool_parser: bool):
"--enable-auto-tool-choice"
,
"--enable-auto-tool-choice"
,
]
]
)
)
if
exclude_tools_when_tool_choice_none
:
args
.
append
(
"--exclude-tools-when-tool-choice-none"
)
return
args
return
args
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
gptoss_server
(
def
gptoss_server
(
default_server_args
:
list
[
str
]):
monkeypatch_module
:
pytest
.
MonkeyPatch
,
default_server_args
:
list
[
str
]
server_args
=
default_server_args
+
[
"--attention-backend=TRITON_ATTN"
]
):
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
server_args
)
as
remote_server
:
with
monkeypatch_module
.
context
()
as
m
:
yield
remote_server
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
@
pytest_asyncio
.
fixture
...
@@ -340,6 +348,69 @@ async def test_gpt_oss_tool_message_array_content(
...
@@ -340,6 +348,69 @@ async def test_gpt_oss_tool_message_array_content(
assert
response_multi_array
.
choices
[
0
].
message
is
not
None
assert
response_multi_array
.
choices
[
0
].
message
is
not
None
@
pytest
.
mark
.
asyncio
async
def
test_gpt_oss_tool_choice_none
(
gptoss_client
:
OpenAI
,
with_tool_parser
:
bool
,
exclude_tools_when_tool_choice_none
:
bool
,
):
if
not
(
with_tool_parser
and
exclude_tools_when_tool_choice_none
):
pytest
.
skip
(
"skip tool_choice tests when non-tool or "
"--exclude-tools-when-tool-choice-none not set"
)
tools
=
[
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
"description"
:
"Get the current weather in a given location"
,
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
},
"state"
:
{
"type"
:
"string"
},
"unit"
:
{
"type"
:
"string"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
},
"required"
:
[
"city"
,
"state"
,
"unit"
],
},
},
}
]
messages
=
[
{
"role"
:
"user"
,
"content"
:
"What's the temperature(in degrees Celsius) in Dallas?"
,
},
]
tool_choice_auto
=
await
gptoss_client
.
chat
.
completions
.
create
(
model
=
GPT_OSS_MODEL_NAME
,
messages
=
messages
,
tools
=
tools
,
tool_choice
=
"auto"
,
temperature
=
0.0
,
)
msg
=
tool_choice_auto
.
choices
[
0
].
message
assert
len
(
msg
.
tool_calls
)
==
1
tool_choice_none
=
await
gptoss_client
.
chat
.
completions
.
create
(
model
=
GPT_OSS_MODEL_NAME
,
messages
=
messages
,
tools
=
tools
,
tool_choice
=
"none"
,
temperature
=
0.0
,
)
msg
=
tool_choice_none
.
choices
[
0
].
message
assert
len
(
msg
.
tool_calls
)
==
0
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
MODEL_NAME_SHORT
=
"gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
...
@@ -401,6 +472,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
...
@@ -401,6 +472,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request
,
lora_request
,
trace_headers
,
trace_headers
,
priority
,
priority
,
data_parallel_rank
,
):
):
return
dict
(
engine_prompt
),
{}
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_transcription_validation_whisper.py
View file @
a810671a
...
@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
...
@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
)
)
assert
transcription
.
segments
is
not
None
assert
transcription
.
segments
is
not
None
assert
len
(
transcription
.
segments
)
>
0
assert
len
(
transcription
.
segments
)
>
0
@
pytest
.
mark
.
asyncio
async
def
test_audio_with_max_tokens
(
whisper_client
,
mary_had_lamb
):
transcription
=
await
whisper_client
.
audio
.
transcriptions
.
create
(
model
=
MODEL_NAME
,
file
=
mary_had_lamb
,
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
,
extra_body
=
{
"max_completion_tokens"
:
1
},
)
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
"text"
]
from
transformers
import
AutoTokenizer
tok
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
out_tokens
=
tok
(
out_text
,
add_special_tokens
=
False
)[
"input_ids"
]
assert
len
(
out_tokens
)
==
1
# max_completion_tokens > max_model_len
transcription
=
await
whisper_client
.
audio
.
transcriptions
.
create
(
model
=
MODEL_NAME
,
file
=
mary_had_lamb
,
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
,
extra_body
=
{
"max_completion_tokens"
:
int
(
1e6
)},
)
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
"text"
]
out_tokens
=
tok
(
out_text
,
add_special_tokens
=
False
)[
"input_ids"
]
assert
len
(
out_tokens
)
<
450
# ~Whisper max output len
tests/entrypoints/openai/test_translation_validation.py
View file @
a810671a
...
@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model):
...
@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model):
)
)
out
=
json
.
loads
(
translation
)[
"text"
].
strip
().
lower
()
out
=
json
.
loads
(
translation
)[
"text"
].
strip
().
lower
()
assert
out
.
count
(
"greek sea"
)
==
2
assert
out
.
count
(
"greek sea"
)
==
2
@
pytest
.
mark
.
asyncio
async
def
test_audio_with_max_tokens
(
mary_had_lamb
,
client_and_model
):
client
,
model_name
=
client_and_model
transcription
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
mary_had_lamb
,
response_format
=
"text"
,
temperature
=
0.0
,
extra_body
=
{
"max_completion_tokens"
:
1
},
)
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
"text"
]
print
(
out_text
)
from
transformers
import
AutoTokenizer
tok
=
AutoTokenizer
.
from_pretrained
(
model_name
)
out_tokens
=
tok
(
out_text
,
add_special_tokens
=
False
)[
"input_ids"
]
assert
len
(
out_tokens
)
==
1
# max_completion_tokens > max_model_len
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
file
=
mary_had_lamb
,
response_format
=
"text"
,
temperature
=
0.0
,
extra_body
=
{
"max_completion_tokens"
:
int
(
1e6
)},
)
out
=
json
.
loads
(
transcription
)
out_text
=
out
[
"text"
]
print
(
out_text
)
out_tokens
=
tok
(
out_text
,
add_special_tokens
=
False
)[
"input_ids"
]
assert
len
(
out_tokens
)
<
450
# ~Whisper max output len
tests/entrypoints/rpc/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_collective_rpc.py
→
tests/entrypoints/
rpc
/test_collective_rpc.py
View file @
a810671a
...
@@ -37,7 +37,7 @@ def server():
...
@@ -37,7 +37,7 @@ def server():
"--max-num-seqs"
,
"--max-num-seqs"
,
"128"
,
"128"
,
"--worker-extension-cls"
,
"--worker-extension-cls"
,
"tests.entrypoints.
openai
.test_collective_rpc.TestWorkerExtension"
,
"tests.entrypoints.
rpc
.test_collective_rpc.TestWorkerExtension"
,
]
]
with
RemoteOpenAIServer
(
with
RemoteOpenAIServer
(
MODEL_NAME
,
MODEL_NAME
,
...
...
tests/entrypoints/sleep/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_sleep.py
→
tests/entrypoints/
sleep
/test_sleep.py
View file @
a810671a
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
import
requests
import
requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
prometheus_client.parser
import
text_string_to_metric_families
from
..
.utils
import
RemoteOpenAIServer
from
tests
.utils
import
RemoteOpenAIServer
MODEL_NAME
=
"meta-llama/Llama-3.2-1B"
MODEL_NAME
=
"meta-llama/Llama-3.2-1B"
...
...
tests/evals/gsm8k/README.md
View file @
a810671a
...
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
...
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
### Run tests with pytest (like buildkite)
### Run tests with pytest (like buildkite)
```
bash
```
bash
pytest
-s
-v
tests/gsm8k/test_gsm8k_correctness.py
\
pytest
-s
-v
tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
\
--config-list-file
=
configs/models-small.txt
--tp-size
=
1
```
```
### Run standalone evaluation script
### Run standalone evaluation script
...
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
...
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold
:
0.54
# Minimum expected accuracy
accuracy_threshold
:
0.54
# Minimum expected accuracy
num_questions
:
1319
# Number of questions (default: full test set)
num_questions
:
1319
# Number of questions (default: full test set)
num_fewshot
:
5
# Few-shot examples from train set
num_fewshot
:
5
# Few-shot examples from train set
max_model_len
:
4096
# Model context length
server_args
:
"
--max-model-len
4096
--tensor-parallel-size
2"
# Server arguments
env
:
# Environment variables (optional)
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
```
```
The
`server_args`
field accepts any arguments that can be passed to
`vllm serve`
.
The
`env`
field accepts a dictionary of environment variables to set for the server process.
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
View file @
a810671a
...
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
...
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
accuracy_threshold
:
0.72
accuracy_threshold
:
0.72
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
View file @
a810671a
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold
:
0.74
accuracy_threshold
:
0.74
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
View file @
a810671a
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold
:
0.31
accuracy_threshold
:
0.31
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
View file @
a810671a
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold
:
0.45
accuracy_threshold
:
0.45
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max
_
model
_
len
:
4096
server_args
:
"
--enforce-eager
--
max
-
model
-
len
4096
"
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
View file @
a810671a
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold
:
0.60
accuracy_threshold
:
0.60
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
Prev
1
2
3
4
5
6
7
8
9
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment