Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a810671a
Commit
a810671a
authored
Jan 08, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori
parents
86b5aefe
6a09612b
Changes
291
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
352 additions
and
26 deletions
+352
-26
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+134
-1
tests/entrypoints/instrumentator/__init__.py
tests/entrypoints/instrumentator/__init__.py
+0
-0
tests/entrypoints/instrumentator/test_metrics.py
tests/entrypoints/instrumentator/test_metrics.py
+2
-3
tests/entrypoints/openai/test_chat_error.py
tests/entrypoints/openai/test_chat_error.py
+1
-0
tests/entrypoints/openai/test_completion_error.py
tests/entrypoints/openai/test_completion_error.py
+1
-0
tests/entrypoints/openai/test_response_api_parsable_context.py
.../entrypoints/openai/test_response_api_parsable_context.py
+6
-0
tests/entrypoints/openai/test_response_api_simple.py
tests/entrypoints/openai/test_response_api_simple.py
+45
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+82
-10
tests/entrypoints/openai/test_transcription_validation_whisper.py
...trypoints/openai/test_transcription_validation_whisper.py
+32
-0
tests/entrypoints/openai/test_translation_validation.py
tests/entrypoints/openai/test_translation_validation.py
+33
-0
tests/entrypoints/rpc/__init__.py
tests/entrypoints/rpc/__init__.py
+0
-0
tests/entrypoints/rpc/test_collective_rpc.py
tests/entrypoints/rpc/test_collective_rpc.py
+1
-1
tests/entrypoints/sleep/__init__.py
tests/entrypoints/sleep/__init__.py
+0
-0
tests/entrypoints/sleep/test_sleep.py
tests/entrypoints/sleep/test_sleep.py
+1
-1
tests/evals/gsm8k/README.md
tests/evals/gsm8k/README.md
+9
-4
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+1
-2
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
...vals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...als/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+1
-1
No files found.
tests/engine/test_arg_utils.py
View file @
a810671a
...
...
@@ -9,7 +9,7 @@ from typing import Annotated, Literal
import
pytest
from
vllm.config
import
CompilationConfig
,
config
from
vllm.config
import
AttentionConfig
,
CompilationConfig
,
config
from
vllm.engine.arg_utils
import
(
EngineArgs
,
contains_type
,
...
...
@@ -298,6 +298,139 @@ def test_compilation_config():
)
def test_attention_config():
    """Check that attention settings given on the CLI reach the engine args.

    Exercised in order: the default value, ``--attention-config.<field>``
    dot notation, the ``--attention-backend`` shorthand, a JSON object passed
    as ``--attention-config=...``, the backend carried into the config built
    by ``create_engine_config()``, and the ``ValueError`` raised when both
    backend flags are supplied together.
    """
    from vllm.attention.backends.registry import AttentionBackendEnum

    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

    def from_argv(argv):
        # Parse ``argv`` with the shared parser and wrap it in EngineArgs.
        namespace = parser.parse_args(argv)
        assert namespace is not None
        return EngineArgs.from_cli_args(namespace)

    def check_all_fields(cfg, backend_name, version, num_splits, flag):
        # Every boolean option is expected to carry the same value ``flag``.
        assert cfg.backend is not None
        assert cfg.backend.name == backend_name
        assert cfg.flash_attn_version == version
        assert cfg.use_prefill_decode_attention is flag
        assert cfg.flash_attn_max_num_splits_for_cuda_graph == num_splits
        assert cfg.use_cudnn_prefill is flag
        assert cfg.use_trtllm_ragged_deepseek_prefill is flag
        assert cfg.use_trtllm_attention is flag
        assert cfg.disable_flashinfer_prefill is flag
        assert cfg.disable_flashinfer_q_quantization is flag

    # default value
    defaults = from_argv([])
    assert defaults.attention_config == AttentionConfig()

    # set backend via dot notation
    dotted_backend = from_argv(["--attention-config.backend", "FLASH_ATTN"])
    assert dotted_backend.attention_config.backend is not None
    assert dotted_backend.attention_config.backend.name == "FLASH_ATTN"

    # set backend via --attention-backend shorthand
    shorthand = from_argv(["--attention-backend", "FLASHINFER"])
    assert shorthand.attention_backend is not None
    assert shorthand.attention_backend == "FLASHINFER"

    # set all fields via dot notation
    all_dotted = from_argv(
        [
            "--attention-config.backend",
            "FLASH_ATTN",
            "--attention-config.flash_attn_version",
            "3",
            "--attention-config.use_prefill_decode_attention",
            "true",
            "--attention-config.flash_attn_max_num_splits_for_cuda_graph",
            "16",
            "--attention-config.use_cudnn_prefill",
            "true",
            "--attention-config.use_trtllm_ragged_deepseek_prefill",
            "true",
            "--attention-config.use_trtllm_attention",
            "true",
            "--attention-config.disable_flashinfer_prefill",
            "true",
            "--attention-config.disable_flashinfer_q_quantization",
            "true",
        ]
    )
    check_all_fields(
        all_dotted.attention_config,
        backend_name="FLASH_ATTN",
        version=3,
        num_splits=16,
        flag=True,
    )

    # set to string form of a dict with all fields
    all_json = from_argv(
        [
            "--attention-config="
            '{"backend": "FLASHINFER", "flash_attn_version": 2, '
            '"use_prefill_decode_attention": false, '
            '"flash_attn_max_num_splits_for_cuda_graph": 8, '
            '"use_cudnn_prefill": false, '
            '"use_trtllm_ragged_deepseek_prefill": false, '
            '"use_trtllm_attention": false, '
            '"disable_flashinfer_prefill": false, '
            '"disable_flashinfer_q_quantization": false}',
        ]
    )
    check_all_fields(
        all_json.attention_config,
        backend_name="FLASHINFER",
        version=2,
        num_splits=8,
        flag=False,
    )

    # test --attention-backend flows into VllmConfig.attention_config
    via_shorthand = from_argv(
        [
            "--model",
            "facebook/opt-125m",
            "--attention-backend",
            "FLASH_ATTN",
        ]
    )
    vllm_config = via_shorthand.create_engine_config()
    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN

    # test --attention-config.backend flows into VllmConfig.attention_config
    via_dotted = from_argv(
        [
            "--model",
            "facebook/opt-125m",
            "--attention-config.backend",
            "FLASHINFER",
        ]
    )
    vllm_config = via_dotted.create_engine_config()
    assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER

    # test --attention-backend and --attention-config.backend are mutually exclusive
    conflicting = from_argv(
        [
            "--model",
            "facebook/opt-125m",
            "--attention-backend",
            "FLASH_ATTN",
            "--attention-config.backend",
            "FLASHINFER",
        ]
    )
    with pytest.raises(ValueError, match="mutually exclusive"):
        conflicting.create_engine_config()
def
test_prefix_cache_default
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
args
=
parser
.
parse_args
([])
...
...
tests/entrypoints/instrumentator/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_metrics.py
→
tests/entrypoints/
instrumentator
/test_metrics.py
View file @
a810671a
...
...
@@ -14,11 +14,10 @@ import requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
tests.conftest
import
LocalAssetServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm
import
version
from
...conftest
import
LocalAssetServer
from
...utils
import
RemoteOpenAIServer
MODELS
=
{
"text"
:
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
,
"multimodal"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
,
...
...
tests/entrypoints/openai/test_chat_error.py
View file @
a810671a
...
...
@@ -76,6 +76,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request
,
trace_headers
,
priority
,
data_parallel_rank
,
):
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_completion_error.py
View file @
a810671a
...
...
@@ -73,6 +73,7 @@ def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
lora_request
,
trace_headers
,
priority
,
data_parallel_rank
,
):
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_response_api_parsable_context.py
View file @
a810671a
...
...
@@ -165,6 +165,7 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
model
=
model_name
,
input
=
"What is 13 * 24? Use python to calculate the result."
,
tools
=
[{
"type"
:
"code_interpreter"
,
"container"
:
{
"type"
:
"auto"
}}],
extra_body
=
{
"enable_response_messages"
:
True
},
temperature
=
0.0
,
)
...
...
@@ -178,3 +179,8 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
# make sure the correct math is in the final output
assert
response
.
output
[
3
].
type
==
"message"
assert
"312"
in
response
.
output
[
3
].
content
[
0
].
text
# test raw input_messages / output_messages
assert
len
(
response
.
input_messages
)
==
1
assert
len
(
response
.
output_messages
)
==
3
assert
"312"
in
response
.
output_messages
[
2
][
"message"
]
tests/entrypoints/openai/test_response_api_simple.py
View file @
a810671a
...
...
@@ -87,3 +87,48 @@ async def test_reasoning_item(client: OpenAI, model_name: str):
assert
response
.
output
[
0
].
type
==
"reasoning"
assert
response
.
output
[
1
].
type
==
"message"
assert
type
(
response
.
output
[
1
].
content
[
0
].
text
)
is
str
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming_output_consistency(client: OpenAI, model_name: str):
    """Streamed text deltas must add up to the final ``output_text``.

    The request is issued with ``stream=True``. The ``delta`` payloads of all
    ``response.output_text.delta`` events are concatenated and compared with
    the ``output_text`` of the response carried by the last event, which is
    required to be a ``response.completed`` event.
    """
    stream = await client.responses.create(
        model=model_name,
        input="Say hello in one sentence.",
        stream=True,
    )

    # Drain the stream first so the checks below can inspect every event.
    events = [event async for event in stream]
    assert len(events) > 0

    # Concatenate all delta text from streaming events
    delta_chunks = [
        event.delta for event in events if event.type == "response.output_text.delta"
    ]
    streaming_text = "".join(delta_chunks)

    # The last event must be the completion event carrying the final response.
    last_event = events[-1]
    assert last_event.type == "response.completed"
    assert last_event.response.status == "completed"

    final_response = last_event.response
    final_output_text = final_response.output_text

    # Verify final response has output
    assert len(final_response.output) > 0

    # Verify streaming text matches final output_text
    assert streaming_text == final_output_text, (
        f"Streaming text does not match final output_text.\n"
        f"Streaming: {streaming_text!r}\n"
        f"Final: {final_output_text!r}"
    )
tests/entrypoints/openai/test_serving_chat.py
View file @
a810671a
...
...
@@ -52,8 +52,19 @@ def with_tool_parser(request) -> bool:
return
request
.
param
@pytest.fixture(
    scope="module",
    params=[True],
    ids=["exclude_tools_when_tool_choice_none"],
)
def exclude_tools_when_tool_choice_none(request) -> bool:
    """Module-scoped, parametrized flag; the only parameter value is ``True``."""
    enabled: bool = request.param
    return enabled
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
with_tool_parser
:
bool
):
def
default_server_args
(
with_tool_parser
:
bool
,
exclude_tools_when_tool_choice_none
:
bool
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--enforce-eager"
,
...
...
@@ -72,19 +83,16 @@ def default_server_args(with_tool_parser: bool):
"--enable-auto-tool-choice"
,
]
)
if
exclude_tools_when_tool_choice_none
:
args
.
append
(
"--exclude-tools-when-tool-choice-none"
)
return
args
@
pytest
.
fixture
(
scope
=
"module"
)
def
gptoss_server
(
monkeypatch_module
:
pytest
.
MonkeyPatch
,
default_server_args
:
list
[
str
]
):
with
monkeypatch_module
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
def
gptoss_server
(
default_server_args
:
list
[
str
]):
server_args
=
default_server_args
+
[
"--attention-backend=TRITON_ATTN"
]
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
server_args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
...
...
@@ -340,6 +348,69 @@ async def test_gpt_oss_tool_message_array_content(
assert
response_multi_array
.
choices
[
0
].
message
is
not
None
@pytest.mark.asyncio
async def test_gpt_oss_tool_choice_none(
    gptoss_client: OpenAI,
    with_tool_parser: bool,
    exclude_tools_when_tool_choice_none: bool,
):
    """Compare tool-call counts for ``tool_choice="auto"`` and ``"none"``.

    The same weather question and tool list are sent twice. The reply for
    ``"auto"`` must contain exactly one tool call and the reply for ``"none"``
    must contain zero. The test is skipped unless both fixture flags are set.
    """
    if not with_tool_parser or not exclude_tools_when_tool_choice_none:
        pytest.skip(
            "skip tool_choice tests when non-tool or "
            "--exclude-tools-when-tool-choice-none not set"
        )

    location_properties = {
        "city": {"type": "string"},
        "state": {"type": "string"},
        "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
        },
    }
    weather_tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": location_properties,
                    "required": ["city", "state", "unit"],
                },
            },
        }
    ]
    conversation = [
        {
            "role": "user",
            "content": "What's the temperature(in degrees Celsius) in Dallas?",
        },
    ]

    async def count_tool_calls(choice: str) -> int:
        # Issue one chat completion with the given tool_choice and report
        # how many tool calls the first choice's message carries.
        completion = await gptoss_client.chat.completions.create(
            model=GPT_OSS_MODEL_NAME,
            messages=conversation,
            tools=weather_tools,
            tool_choice=choice,
            temperature=0.0,
        )
        return len(completion.choices[0].message.tool_calls)

    assert await count_tool_calls("auto") == 1
    assert await count_tool_calls("none") == 0
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
...
...
@@ -401,6 +472,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
lora_request
,
trace_headers
,
priority
,
data_parallel_rank
,
):
return
dict
(
engine_prompt
),
{}
...
...
tests/entrypoints/openai/test_transcription_validation_whisper.py
View file @
a810671a
...
...
@@ -244,3 +244,35 @@ async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
)
assert
transcription
.
segments
is
not
None
assert
len
(
transcription
.
segments
)
>
0
@pytest.mark.asyncio
async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
    """Check the effect of ``max_completion_tokens`` on transcriptions.

    A limit of 1 must give a text that tokenizes to exactly one token; a
    limit of ``int(1e6)`` must still give fewer than 450 tokens.
    """
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    async def transcribed_token_ids(max_tokens: int):
        # Transcribe with the given cap and tokenize the returned text
        # without special tokens so only generated content is counted.
        reply = await whisper_client.audio.transcriptions.create(
            model=MODEL_NAME,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
            extra_body={"max_completion_tokens": max_tokens},
        )
        text = json.loads(reply)["text"]
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    assert len(await transcribed_token_ids(1)) == 1

    # max_completion_tokens > max_model_len
    assert len(await transcribed_token_ids(int(1e6))) < 450  # ~Whisper max output len
tests/entrypoints/openai/test_translation_validation.py
View file @
a810671a
...
...
@@ -227,3 +227,36 @@ async def test_long_audio_request(foscolo, client_and_model):
)
out
=
json
.
loads
(
translation
)[
"text"
].
strip
().
lower
()
assert
out
.
count
(
"greek sea"
)
==
2
@pytest.mark.asyncio
async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
    """Check the effect of ``max_completion_tokens`` on translations.

    A limit of 1 must give a text that tokenizes to exactly one token; a
    limit of ``int(1e6)`` must still give fewer than 450 tokens.

    Args:
        mary_had_lamb: Audio file fixture sent as the request ``file``.
        client_and_model: Fixture yielding a ``(client, model_name)`` pair.
    """
    client, model_name = client_and_model
    translation = await client.audio.translations.create(
        model=model_name,
        file=mary_had_lamb,
        response_format="text",
        temperature=0.0,
        extra_body={"max_completion_tokens": 1},
    )
    out = json.loads(translation)
    out_text = out["text"]
    print(out_text)
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(model_name)
    # Special tokens are excluded so only generated content is counted.
    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
    assert len(out_tokens) == 1

    # max_completion_tokens > max_model_len
    # This request also goes through the translations endpoint, so both
    # halves of the test exercise the same API as the rest of this
    # translation test module.
    translation = await client.audio.translations.create(
        model=model_name,
        file=mary_had_lamb,
        response_format="text",
        temperature=0.0,
        extra_body={"max_completion_tokens": int(1e6)},
    )
    out = json.loads(translation)
    out_text = out["text"]
    print(out_text)
    out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
    assert len(out_tokens) < 450  # ~Whisper max output len
tests/entrypoints/rpc/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_collective_rpc.py
→
tests/entrypoints/
rpc
/test_collective_rpc.py
View file @
a810671a
...
...
@@ -37,7 +37,7 @@ def server():
"--max-num-seqs"
,
"128"
,
"--worker-extension-cls"
,
"tests.entrypoints.
openai
.test_collective_rpc.TestWorkerExtension"
,
"tests.entrypoints.
rpc
.test_collective_rpc.TestWorkerExtension"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
...
...
tests/entrypoints/sleep/__init__.py
0 → 100644
View file @
a810671a
tests/entrypoints/
openai
/test_sleep.py
→
tests/entrypoints/
sleep
/test_sleep.py
View file @
a810671a
...
...
@@ -4,7 +4,7 @@
import
requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
..
.utils
import
RemoteOpenAIServer
from
tests
.utils
import
RemoteOpenAIServer
MODEL_NAME
=
"meta-llama/Llama-3.2-1B"
...
...
tests/evals/gsm8k/README.md
View file @
a810671a
...
...
@@ -7,9 +7,8 @@ This directory contains a replacement for the lm-eval-harness GSM8K evaluation,
### Run tests with pytest (like buildkite)
```
bash
pytest
-s
-v
tests/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
\
--tp-size
=
1
pytest
-s
-v
tests/evals/gsm8k/test_gsm8k_correctness.py
\
--config-list-file
=
configs/models-small.txt
```
### Run standalone evaluation script
...
...
@@ -31,5 +30,11 @@ model_name: "Qwen/Qwen2.5-1.5B-Instruct"
accuracy_threshold
:
0.54
# Minimum expected accuracy
num_questions
:
1319
# Number of questions (default: full test set)
num_fewshot
:
5
# Few-shot examples from train set
max_model_len
:
4096
# Model context length
server_args
:
"
--max-model-len
4096
--tensor-parallel-size
2"
# Server arguments
env
:
# Environment variables (optional)
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
```
The
`server_args`
field accepts any arguments that can be passed to
`vllm serve`
.
The
`env`
field accepts a dictionary of environment variables to set for the server process.
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
View file @
a810671a
...
...
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
accuracy_threshold
:
0.72
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
View file @
a810671a
...
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold
:
0.74
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
View file @
a810671a
...
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold
:
0.31
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
View file @
a810671a
...
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold
:
0.45
num_questions
:
1319
num_fewshot
:
5
max
_
model
_
len
:
4096
server_args
:
"
--enforce-eager
--
max
-
model
-
len
4096
"
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
View file @
a810671a
...
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold
:
0.60
num_questions
:
1319
num_fewshot
:
5
max_model_len
:
4096
\ No newline at end of file
server_args
:
"
--enforce-eager
--max-model-len
4096"
Prev
1
2
3
4
5
6
7
8
9
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment