Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
daed453e
Unverified
Commit
daed453e
authored
Apr 27, 2025
by
Lianmin Zheng
Committed by
GitHub
Apr 27, 2025
Browse files
[CI] Improve github summary & enable fa3 for more models (#5796)
parent
ded04b2e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
26 additions
and
20 deletions
+26
-20
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+1
-1
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+1
-0
test/srt/run_suite.py
test/srt/run_suite.py
+1
-1
test/srt/test_bench_one_batch.py
test/srt/test_bench_one_batch.py
+7
-8
test/srt/test_bench_serving.py
test/srt/test_bench_serving.py
+5
-5
test/srt/test_full_deepseek_v3.py
test/srt/test_full_deepseek_v3.py
+11
-5
No files found.
.github/workflows/pr-test.yml
View file @
daed453e
...
...
@@ -123,7 +123,7 @@ jobs:
timeout-minutes
:
10
run
:
|
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1
_default
-
name
:
Benchmark online latency
timeout-minutes
:
10
...
...
python/sglang/srt/utils.py
View file @
daed453e
...
...
@@ -1970,6 +1970,7 @@ def is_fa3_default_architecture(hf_config):
"Llama4ForConditionalGeneration"
,
"LlamaForCausalLM"
,
"MistralForCausalLM"
,
"MixtralForCausalLM"
,
"Gemma2ForCausalLM"
,
"Gemma3ForConditionalGeneration"
,
}
...
...
test/srt/run_suite.py
View file @
daed453e
...
...
@@ -64,7 +64,7 @@ suites = {
TestFile
(
"test_retract_decode.py"
,
54
),
TestFile
(
"test_server_args.py"
,
1
),
TestFile
(
"test_skip_tokenizer_init.py"
,
117
),
TestFile
(
"test_srt_engine.py"
,
2
37
),
TestFile
(
"test_srt_engine.py"
,
2
61
),
TestFile
(
"test_srt_endpoint.py"
,
130
),
TestFile
(
"test_torch_compile.py"
,
76
),
TestFile
(
"test_torch_compile_moe.py"
,
172
),
...
...
test/srt/test_bench_one_batch.py
View file @
daed453e
...
...
@@ -4,7 +4,6 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
CustomTestCase
,
get_bool_env_var
,
is_in_ci
,
run_bench_one_batch
,
write_github_step_summary
,
...
...
@@ -12,15 +11,15 @@ from sglang.test.test_utils import (
class
TestBenchOneBatch
(
CustomTestCase
):
def
test_bs1
(
self
):
def
test_bs1
_default
(
self
):
output_throughput
=
run_bench_one_batch
(
DEFAULT_MODEL_NAME_FOR_TEST
,
[
"--cuda-graph-max-bs"
,
"2"
]
)
if
is_in_ci
():
write_github_step_summary
(
f
"### test_bs1
\n
"
f
"output_throughput
:
{
output_throughput
:.
2
f
}
token/s
\n
"
f
"### test_bs1
_default (llama-3.1-8b)
\n
"
f
"output_throughput:
{
output_throughput
:.
2
f
}
token/s
\n
"
)
self
.
assertGreater
(
output_throughput
,
135
)
...
...
@@ -32,9 +31,9 @@ class TestBenchOneBatch(CustomTestCase):
if
is_in_ci
():
write_github_step_summary
(
f
"### test_moe_tp2_bs1
\n
"
f
"output_throughput
:
{
output_throughput
:.
2
f
}
token/s
\n
"
f
"output_throughput:
{
output_throughput
:.
2
f
}
token/s
\n
"
)
self
.
assertGreater
(
output_throughput
,
12
4
)
self
.
assertGreater
(
output_throughput
,
12
5
)
def
test_torch_compile_tp2_bs1
(
self
):
output_throughput
=
run_bench_one_batch
(
...
...
@@ -45,9 +44,9 @@ class TestBenchOneBatch(CustomTestCase):
if
is_in_ci
():
write_github_step_summary
(
f
"### test_torch_compile_tp2_bs1
\n
"
f
"output_throughput
:
{
output_throughput
:.
2
f
}
token/s
\n
"
f
"output_throughput:
{
output_throughput
:.
2
f
}
token/s
\n
"
)
self
.
assertGreater
(
output_throughput
,
22
5
)
self
.
assertGreater
(
output_throughput
,
22
0
)
if
__name__
==
"__main__"
:
...
...
test/srt/test_bench_serving.py
View file @
daed453e
...
...
@@ -98,7 +98,7 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_with_triton_attention_backend
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
self
.
assertGreater
(
res
[
"output_throughput"
],
3
6
00
)
self
.
assertGreater
(
res
[
"output_throughput"
],
3
7
00
)
def
test_offline_throughput_default_fp8
(
self
):
res
=
run_bench_serving
(
...
...
@@ -113,7 +113,7 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_default_fp8
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
self
.
assertGreater
(
res
[
"output_throughput"
],
4
2
00
)
self
.
assertGreater
(
res
[
"output_throughput"
],
4
3
00
)
def
test_online_latency_default
(
self
):
res
=
run_bench_serving
(
...
...
@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
if
is_in_ci
():
write_github_step_summary
(
f
"### test_online_latency_default
\n
"
f
'median_e2e_latency_ms
:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
f
'median_e2e_latency_ms:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
)
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
11000
)
self
.
assertLess
(
res
[
"median_ttft_ms"
],
86
)
...
...
@@ -161,8 +161,8 @@ class TestBenchServing(CustomTestCase):
if
is_in_ci
():
write_github_step_summary
(
f
"### test_online_latency_eagle
\n
"
f
'median_e2e_latency_ms
:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
f
'accept_length
:
{
res
[
"accept_length"
]:.
2
f
}
\n
'
f
'median_e2e_latency_ms:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
f
'accept_length:
{
res
[
"accept_length"
]:.
2
f
}
\n
'
)
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
900
)
self
.
assertGreater
(
res
[
"accept_length"
],
3.0
)
...
...
test/srt/test_full_deepseek_v3.py
View file @
daed453e
...
...
@@ -2,7 +2,6 @@ import unittest
from
types
import
SimpleNamespace
import
requests
import
torch
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
...
...
@@ -49,7 +48,7 @@ class TestDeepseekV3(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
f
"
{
metrics
=
}
"
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
4
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
35
)
class
TestBenchOneBatch
(
CustomTestCase
):
...
...
@@ -58,11 +57,11 @@ class TestBenchOneBatch(CustomTestCase):
FULL_DEEPSEEK_V3_MODEL_PATH
,
[
"--trust-remote-code"
,
"--tp"
,
"8"
,
"--cuda-graph-max-bs"
,
"2"
],
)
print
(
f
"output_throughput :
{
output_throughput
:.
2
f
}
token/s"
)
print
(
f
"
{
output_throughput
=
:.
2
f
}
token/s"
)
if
is_in_ci
():
write_github_step_summary
(
f
"### test_bs1
\n
"
f
"output_throughput :
{
output_throughput
:.
2
f
}
token/s
\n
"
f
"### test_bs1 (deepseek-v3)
\n
"
f
"
{
output_throughput
=
:.
2
f
}
token/s
\n
"
)
self
.
assertGreater
(
output_throughput
,
70
)
...
...
@@ -121,6 +120,13 @@ class TestDeepseekV3MTP(CustomTestCase):
print
(
f
"
{
avg_spec_accept_length
=
}
"
)
self
.
assertGreater
(
avg_spec_accept_length
,
3.2
)
if
is_in_ci
():
write_github_step_summary
(
f
"### test_gsm8k (deepseek-v3)
\n
"
f
'
{
metrics
[
"accuracy"
]
=
:.
3
f
}
\n
'
f
"
{
avg_spec_accept_length
=
:.
2
f
}
\n
"
)
if
__name__
==
"__main__"
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment