zhaoyu6 / sglang · Commits

Commit 2e763398 (Unverified)
Authored Sep 29, 2025 by Mick; committed by GitHub Sep 28, 2025

fix: show failed models in nightly ci (#10986)

Parent: 336e9a60
Showing 5 changed files with 39 additions and 32 deletions (+39 / -32)
Changed files:
  python/sglang/bench_one_batch_server.py          (+2 / -3)
  python/sglang/srt/server_args.py                 (+0 / -1)
  python/sglang/test/test_utils.py                 (+35 / -26)
  test/srt/test_nightly_text_models_gsm8k_eval.py  (+1 / -1)
  test/srt/test_nightly_vlms_mmmu_eval.py          (+1 / -1)
python/sglang/bench_one_batch_server.py

@@ -66,9 +66,8 @@ class BenchmarkResult(BaseModel):
     def help_str() -> str:
         return f"""
 Note: To view the traces through perfetto-ui, please:
-1. use Google Chrome
-2. enable popup
+1. open with Google Chrome
+2. allow popup
 """

     def to_markdown_row(
python/sglang/srt/server_args.py

@@ -51,7 +51,6 @@ from sglang.utils import is_in_ci
 logger = logging.getLogger(__name__)

 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
python/sglang/test/test_utils.py

@@ -1518,31 +1518,45 @@ def check_evaluation_test_results(
     summary = " | model | status | score | score_threshold |\n"
     summary += "| ----- | ------ | ----- | --------------- |\n"
-    for model, accuracy, latency in results:
-        accuracy_threshold = model_accuracy_thresholds.get(model)
-        if accuracy_threshold is None:
-            print(f"Warning: No threshold defined for model {model}")
-            continue
+    results_dict = {res[0]: (res[1], res[2]) for res in results}
+    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
         latency_threshold = (
-            model_latency_thresholds.get(model, None)
-            if model_latency_thresholds
+            model_latency_thresholds.get(model)
+            if model_latency_thresholds is not None
             else 1e9
         )
-        is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
-        status_emoji = "✅" if is_success else "❌"
-        if not is_success:
-            if accuracy < accuracy_threshold:
-                failed_models.append(
-                    f"\nScore Check Failed: {model}\n"
-                    f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
-                )
-            if latency > latency_threshold:
-                failed_models.append(
-                    f"\nLatency Check Failed: {model}\n"
-                    f"Model {model} latency ({latency:.4f}) is above threshold ({latency_threshold:.4f})"
-                )
-        if model_latency_thresholds is not None:
-            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
-        else:
-            line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
+        if model in results_dict:
+            accuracy, latency = results_dict[model]
+            is_success = accuracy >= accuracy_threshold and latency <= latency_threshold
+            status_emoji = "✅" if is_success else "❌"
+            if not is_success:
+                failed_models.append(
+                    f"\nScore Check Failed: {model}\n"
+                    f"Model {model} score ({accuracy:.4f}) is below threshold ({accuracy_threshold:.4f})"
+                )
+            if model_latency_thresholds is not None:
+                line = f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold} | {latency} | {latency_threshold}\n"
+            else:
+                line = (
+                    f"| {model} | {status_emoji} | {accuracy} | {accuracy_threshold}\n"
+                )
+        else:
+            status_emoji = "❌"
+            failed_models.append(f"Model failed to launch or be evaluated: {model}")
+            if model_latency_thresholds is not None:
+                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold} | N/A | {latency_threshold}\n"
+            else:
+                line = f"| {model} | {status_emoji} | N/A | {accuracy_threshold}\n"
         summary += line

@@ -1551,13 +1565,8 @@ def check_evaluation_test_results(
     if is_in_ci():
         write_github_step_summary(f"## {test_name}\n{summary}")

-    some_model_failed_to_get_result = len(results) != (
-        model_count or len(model_accuracy_thresholds)
-    )
-    if some_model_failed_to_get_result:
-        print("Some model has failed to launch and be evaluated")
-    if failed_models or some_model_failed_to_get_result:
+    if failed_models:
+        print("Some models failed the evaluation.")
         raise AssertionError("\n".join(failed_models))
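The substance of the commit is this check_evaluation_test_results change: instead of looping only over the results that were actually produced, the new code loops over every model listed in model_accuracy_thresholds and reports models that never produced a result as failures with an "N/A" row. The following standalone sketch illustrates the pattern; it is a simplified stand-in written for this note, not the sglang function, and the summarize name and example data are made up:

def summarize(results, model_accuracy_thresholds, model_latency_thresholds=None):
    # results: list of (model, accuracy, latency) tuples from the nightly run
    failed_models = []
    summary = "| model | status | score | score_threshold |\n"
    summary += "| ----- | ------ | ----- | --------------- |\n"
    results_dict = {res[0]: (res[1], res[2]) for res in results}
    for model, accuracy_threshold in sorted(model_accuracy_thresholds.items()):
        latency_threshold = (
            model_latency_thresholds.get(model)
            if model_latency_thresholds is not None
            else 1e9  # effectively no latency limit when no thresholds are given
        )
        if model in results_dict:
            accuracy, latency = results_dict[model]
            ok = accuracy >= accuracy_threshold and latency <= latency_threshold
            if not ok:
                failed_models.append(f"Score/latency check failed: {model}")
            summary += f"| {model} | {'✅' if ok else '❌'} | {accuracy} | {accuracy_threshold} |\n"
        else:
            # Key behavioural change: a model that never launched or was never
            # evaluated still gets a row and is recorded as a failure.
            failed_models.append(f"Model failed to launch or be evaluated: {model}")
            summary += f"| {model} | ❌ | N/A | {accuracy_threshold} |\n"
    return summary, failed_models


# Example: two models configured, only one returned a result.
summary, failed = summarize(
    results=[("meta-llama/Llama-3.1-70B-Instruct", 0.96, 20.1)],
    model_accuracy_thresholds={
        "meta-llama/Llama-3.1-70B-Instruct": 0.95,
        "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.616,
    },
)
print(summary)
print(failed)  # ['Model failed to launch or be evaluated: mistralai/Mixtral-8x7B-Instruct-v0.1']

Under the previous logic, a model that crashed at launch never appeared in results and therefore never got a row in the summary table; the only signal was the separate some_model_failed_to_get_result length check, which appears to be what the commit title "show failed models in nightly ci" addresses.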
test/srt/test_nightly_text_models_gsm8k_eval.py

@@ -24,7 +24,7 @@ MODEL_SCORE_THRESHOLDS = {
     "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
     "google/gemma-2-27b-it": 0.91,
     "meta-llama/Llama-3.1-70B-Instruct": 0.95,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.62,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.616,
     "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
test/srt/test_nightly_vlms_mmmu_eval.py

@@ -24,7 +24,7 @@ MODEL_THRESHOLDS = {
     ),
     ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
     ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
-    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 14.5),
+    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
     ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
     ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
     ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
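For reference, the VLM nightly test keys its thresholds by ModelDeploySetup and stores two numbers per model in ModelEvalMetrics, which read as a score floor and a per-model time bound (the pixtral-12b bound is what this commit relaxes from 14.5 to 16.6). The sketch below uses simplified stand-in dataclasses and a hypothetical split_thresholds helper, only to illustrate how such a mapping could be flattened into the two per-model dicts that check_evaluation_test_results consumes; it is an assumption for illustration, not the sglang implementation:

from dataclasses import dataclass


@dataclass(frozen=True)
class ModelDeploySetup:
    # Simplified stand-in: the real class also carries launch configuration.
    model_path: str


@dataclass(frozen=True)
class ModelEvalMetrics:
    # Simplified stand-in: minimum acceptable score and maximum acceptable
    # evaluation time (field names here are assumptions).
    accuracy: float
    eval_time: float


MODEL_THRESHOLDS = {
    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
    ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
}


def split_thresholds(thresholds):
    # Hypothetical helper (not sglang code): derive the per-model accuracy and
    # latency dicts keyed by model path.
    accuracy = {setup.model_path: m.accuracy for setup, m in thresholds.items()}
    latency = {setup.model_path: m.eval_time for setup, m in thresholds.items()}
    return accuracy, latency


accuracy_thresholds, latency_thresholds = split_thresholds(MODEL_THRESHOLDS)
print(accuracy_thresholds)  # {'mistral-community/pixtral-12b': 0.36, 'openbmb/MiniCPM-v-2_6': 0.27}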