Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
4de03953
Unverified
Commit
4de03953
authored
Jul 04, 2025
by
YanbingJiang
Committed by
GitHub
Jul 03, 2025
Browse files
Add V2-lite model test (#7390)
Co-authored-by:
DiweiSun
<
105627594+DiweiSun@users.noreply.github.com
>
parent
8b1942c6
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
98 additions
and
6 deletions
+98
-6
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+15
-3
test/srt/models/test_dummy_grok_models.py
test/srt/models/test_dummy_grok_models.py
+1
-1
test/srt/run_suite.py
test/srt/run_suite.py
+1
-0
test/srt/test_bench_one_batch.py
test/srt/test_bench_one_batch.py
+1
-1
test/srt/test_flashmla.py
test/srt/test_flashmla.py
+1
-1
test/srt/test_intel_amx_attention_backend.py
test/srt/test_intel_amx_attention_backend.py
+79
-0
No files found.
python/sglang/test/test_utils.py
View file @
4de03953
...
@@ -5,6 +5,7 @@ import copy
...
@@ -5,6 +5,7 @@ import copy
import
logging
import
logging
import
os
import
os
import
random
import
random
import
re
import
subprocess
import
subprocess
import
threading
import
threading
import
time
import
time
...
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
...
@@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
print
(
f
"Output:
{
output
}
"
,
flush
=
True
)
print
(
f
"Output:
{
output
}
"
,
flush
=
True
)
print
(
f
"Error:
{
error
}
"
,
flush
=
True
)
print
(
f
"Error:
{
error
}
"
,
flush
=
True
)
lastline
=
output
.
split
(
"
\n
"
)[
-
3
]
# Return prefill_latency, decode_throughput, decode_latency
output_throughput
=
float
(
lastline
.
split
(
" "
)[
-
2
])
prefill_line
=
output
.
split
(
"
\n
"
)[
-
9
]
decode_line
=
output
.
split
(
"
\n
"
)[
-
3
]
pattern
=
(
r
"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
)
match
=
re
.
search
(
pattern
,
prefill_line
)
if
match
:
prefill_latency
=
float
(
match
.
group
(
"latency"
))
match
=
re
.
search
(
pattern
,
decode_line
)
if
match
:
decode_latency
=
float
(
match
.
group
(
"latency"
))
decode_throughput
=
float
(
match
.
group
(
"throughput"
))
finally
:
finally
:
kill_process_tree
(
process
.
pid
)
kill_process_tree
(
process
.
pid
)
return
output_throughput
return
prefill_latency
,
decode_throughput
,
decode_latency
def
run_bench_offline_throughput
(
model
,
other_args
):
def
run_bench_offline_throughput
(
model
,
other_args
):
...
...
test/srt/models/test_dummy_grok_models.py
View file @
4de03953
...
@@ -6,7 +6,7 @@ from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
...
@@ -6,7 +6,7 @@ from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
class
TestDummyGrok1
(
CustomTestCase
):
class
TestDummyGrok1
(
CustomTestCase
):
def
test_dummy_grok_1
(
self
):
def
test_dummy_grok_1
(
self
):
output_throughput
=
run_bench_one_batch
(
_
,
output_throughput
,
_
=
run_bench_one_batch
(
None
,
None
,
[
[
"--model"
,
"--model"
,
...
...
test/srt/run_suite.py
View file @
4de03953
...
@@ -198,6 +198,7 @@ suites = {
...
@@ -198,6 +198,7 @@ suites = {
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"test_intel_amx_attention_backend.py"
),
],
],
"nightly"
:
[
"nightly"
:
[
TestFile
(
"test_nightly_gsm8k_eval.py"
),
TestFile
(
"test_nightly_gsm8k_eval.py"
),
...
...
test/srt/test_bench_one_batch.py
View file @
4de03953
...
@@ -20,7 +20,7 @@ from sglang.test.test_utils import (
...
@@ -20,7 +20,7 @@ from sglang.test.test_utils import (
class
TestBenchOneBatch
(
CustomTestCase
):
class
TestBenchOneBatch
(
CustomTestCase
):
def
test_bs1_small
(
self
):
def
test_bs1_small
(
self
):
output_throughput
=
run_bench_one_batch
(
_
,
output_throughput
,
_
=
run_bench_one_batch
(
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
,
[
"--cuda-graph-max-bs"
,
"2"
]
DEFAULT_SMALL_MODEL_NAME_FOR_TEST
,
[
"--cuda-graph-max-bs"
,
"2"
]
)
)
self
.
assertGreater
(
output_throughput
,
50
)
self
.
assertGreater
(
output_throughput
,
50
)
...
...
test/srt/test_flashmla.py
View file @
4de03953
...
@@ -67,7 +67,7 @@ class TestFlashMLAAttnBackend(unittest.TestCase):
...
@@ -67,7 +67,7 @@ class TestFlashMLAAttnBackend(unittest.TestCase):
class
TestFlashMLAAttnLatency
(
unittest
.
TestCase
):
class
TestFlashMLAAttnLatency
(
unittest
.
TestCase
):
def
test_latency
(
self
):
def
test_latency
(
self
):
output_throughput
=
run_bench_one_batch
(
_
,
output_throughput
,
_
=
run_bench_one_batch
(
DEFAULT_MODEL_NAME_FOR_TEST_MLA
,
DEFAULT_MODEL_NAME_FOR_TEST_MLA
,
[
[
"--attention-backend"
,
"--attention-backend"
,
...
...
test/srt/test_intel_amx_attention_backend.py
0 → 100644
View file @
4de03953
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
"""
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_one_batch
,
)
class TestIntelAMXAttnBackend(CustomTestCase):
    """Latency and MMLU checks for the ``intel_amx`` attention backend."""

    def test_latency(self):
        """Run the one-batch benchmark and, in CI only, check decode throughput."""
        bench_args = [
            "--attention-backend",
            "intel_amx",
            "--mem-fraction-static",
            "0.05",
            "--disable-radix",
            "--trust-remote-code",
            "--batch-size",
            "4",
        ]
        bench_result = run_bench_one_batch(DEFAULT_MLA_MODEL_NAME_FOR_TEST, bench_args)
        # The names below are part of the printed output (f-string "=" specifier),
        # so they must stay exactly as they are.
        prefill_latency, decode_throughput, decode_latency = bench_result

        print(f"{prefill_latency=}")
        print(f"{decode_throughput=}")
        print(f"{decode_latency=}")

        # Outside CI the numbers are only reported, never asserted on.
        if not is_in_ci():
            return
        self.assertGreater(decode_throughput, 10)

    def test_mmlu(self):
        """Launch a server with the ``intel_amx`` backend and check the MMLU score."""
        model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        server_args = [
            "--attention-backend",
            "intel_amx",
            "--mem-fraction-static",
            "0.05",
            "--disable-radix",
            "--trust-remote-code",
            "--disable-overlap-schedule",
        ]
        server = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=server_args,
        )

        try:
            eval_args = SimpleNamespace(
                base_url=base_url,
                model=model,
                eval_name="mmlu",
                num_examples=64,
                num_threads=32,
            )
            results = run_eval(eval_args)
            self.assertGreater(results["score"], 0.5)
        finally:
            # Always tear the launched server down, even when the eval or assert fails.
            kill_process_tree(server.pid)
# Run this module's tests via unittest when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment