Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
cbac4997
Unverified
Commit
cbac4997
authored
Oct 16, 2025
by
YanbingJiang
Committed by
GitHub
Oct 15, 2025
Browse files
Split test_intel_amx_attention_backend.py to pass CI of timeout (#11370)
Co-authored-by:
Ma Mingfei
<
mingfei.ma@intel.com
>
parent
476c67d7
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
197 additions
and
5 deletions
+197
-5
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+31
-1
test/srt/cpu/test_cpu_graph.py
test/srt/cpu/test_cpu_graph.py
+1
-2
test/srt/cpu/test_intel_amx_attention_backend_a.py
test/srt/cpu/test_intel_amx_attention_backend_a.py
+73
-0
test/srt/cpu/test_intel_amx_attention_backend_b.py
test/srt/cpu/test_intel_amx_attention_backend_b.py
+35
-0
test/srt/cpu/test_intel_amx_attention_backend_c.py
test/srt/cpu/test_intel_amx_attention_backend_c.py
+53
-0
test/srt/run_suite.py
test/srt/run_suite.py
+4
-2
No files found.
python/sglang/test/test_utils.py
View file @
cbac4997
...
@@ -16,7 +16,7 @@ import unittest
...
@@ -16,7 +16,7 @@ import unittest
from
concurrent.futures
import
ThreadPoolExecutor
from
concurrent.futures
import
ThreadPoolExecutor
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
datetime
import
datetime
from
functools
import
partial
from
functools
import
partial
,
wraps
from
pathlib
import
Path
from
pathlib
import
Path
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
typing
import
Any
,
Awaitable
,
Callable
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Awaitable
,
Callable
,
List
,
Optional
,
Tuple
...
@@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"):
...
@@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"):
with
open
(
"results.json"
,
"w"
)
as
f
:
with
open
(
"results.json"
,
"w"
)
as
f
:
json
.
dump
(
existing_results
,
f
,
indent
=
2
)
json
.
dump
(
existing_results
,
f
,
indent
=
2
)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Create a decorator that runs a one-batch intel_amx benchmark for a test.

    The decorated test method must return a model name. The method that
    replaces it calls the original to obtain the model, passes the model and
    the assembled argument list to ``run_bench_one_batch``, prints the three
    values it returns, and, when ``is_in_ci()`` is true and
    ``min_throughput`` is not ``None``, asserts that the decode throughput is
    greater than ``min_throughput``.

    Args:
        extra_args: Optional list of arguments appended after the fixed
            intel_amx arguments. ``None`` (or an empty list) appends nothing.
        min_throughput: Optional exclusive lower bound for the decode
            throughput. It is only checked when ``is_in_ci()`` is true.

    Returns:
        A decorator that takes the test method and returns the wrapped method.
    """

    def wrap_test(test_func):
        @wraps(test_func)
        def run_benchmark(self):
            # A new list is built on every call, so the callee never sees a
            # list shared with a previous run.
            full_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ] + (extra_args or [])

            # The wrapped test only supplies the model name.
            model = test_func(self)
            measurements = run_bench_one_batch(model, full_args)
            prefill_latency, decode_throughput, decode_latency = measurements

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            # The threshold is enforced only in CI and only when one is given.
            if not is_in_ci() or min_throughput is None:
                return
            self.assertGreater(decode_throughput, min_throughput)

        return run_benchmark

    return wrap_test
test/srt/test_cpu_graph.py
→
test/srt/
cpu/
test_cpu_graph.py
View file @
cbac4997
...
@@ -8,8 +8,6 @@ import os
...
@@ -8,8 +8,6 @@ import os
import
unittest
import
unittest
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
test_intel_amx_attention_backend
import
intel_amx_benchmark
from
sglang.srt.utils
import
get_cpu_ids_by_node
,
kill_process_tree
from
sglang.srt.utils
import
get_cpu_ids_by_node
,
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
...
@@ -17,6 +15,7 @@ from sglang.test.test_utils import (
...
@@ -17,6 +15,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
CustomTestCase
,
intel_amx_benchmark
,
is_in_ci
,
is_in_ci
,
popen_launch_server
,
popen_launch_server
,
)
)
...
...
test/srt/test_intel_amx_attention_backend.py
→
test/srt/
cpu/
test_intel_amx_attention_backend
_a
.py
View file @
cbac4997
"""
"""
Usage:
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_
mmlu
python3 -m unittest test_intel_amx_attention_backend_a.TestIntelAMXAttnBackend.test_
latency_default_model
"""
"""
import
unittest
import
unittest
from
functools
import
wraps
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
from
sglang.srt.utils
import
kill_process_tree
...
@@ -12,91 +11,30 @@ from sglang.test.run_eval import run_eval
...
@@ -12,91 +11,30 @@ from sglang.test.run_eval import run_eval
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
CustomTestCase
,
intel_amx_benchmark
,
is_in_ci
,
is_in_ci
,
popen_launch_server
,
popen_launch_server
,
run_bench_one_batch
,
)
)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Create a decorator that runs a one-batch intel_amx benchmark for a test.

    The decorated test method must return a model name. The method that
    replaces it calls the original to obtain the model, passes the model and
    the assembled argument list to ``run_bench_one_batch``, prints the three
    values it returns, and, when ``is_in_ci()`` is true and
    ``min_throughput`` is not ``None``, asserts that the decode throughput is
    greater than ``min_throughput``.

    Args:
        extra_args: Optional list of arguments appended after the fixed
            intel_amx arguments. ``None`` (or an empty list) appends nothing.
        min_throughput: Optional exclusive lower bound for the decode
            throughput. It is only checked when ``is_in_ci()`` is true.

    Returns:
        A decorator that takes the test method and returns the wrapped method.
    """

    def wrap_test(test_func):
        @wraps(test_func)
        def run_benchmark(self):
            # A new list is built on every call, so the callee never sees a
            # list shared with a previous run.
            full_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ] + (extra_args or [])

            # The wrapped test only supplies the model name.
            model = test_func(self)
            measurements = run_bench_one_batch(model, full_args)
            prefill_latency, decode_throughput, decode_latency = measurements

            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            # The threshold is enforced only in CI and only when one is given.
            if not is_in_ci() or min_throughput is None:
                return
            self.assertGreater(decode_throughput, min_throughput)

        return run_benchmark

    return wrap_test
class
TestIntelAMXAttnBackend
(
CustomTestCase
):
class
TestIntelAMXAttnBackend
(
CustomTestCase
):
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
10
)
def
test_latency_mla_model
(
self
):
return
DEFAULT_MLA_MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
40
)
def
test_latency_default_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
150
)
def
test_latency_fp8_qwen
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
50
)
def
test_latency_fp8_moe_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
@
intel_amx_benchmark
(
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
,
"--
quantiz
ati
on
"
,
"
w8a8_int8
"
],
extra_args
=
[
"--batch-size"
,
"4"
,
"--
mem-fraction-st
ati
c
"
,
"
0.3
"
],
min_throughput
=
10
0
,
min_throughput
=
10
,
)
)
def
test_latency_
w8a8_default
_model
(
self
):
def
test_latency_
mla
_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
_W8A8
return
DEFAULT_
MLA_
MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
@
intel_amx_benchmark
(
extra_args
=
[
extra_args
=
[
"--batch-size"
,
"4"
,
"--mem-fraction-static"
,
"0.1"
],
"--batch-size"
,
min_throughput
=
40
,
"4"
,
"--quantization"
,
"w8a8_int8"
,
"--mem-fraction-static"
,
"0.9"
,
"--max-total-tokens"
,
"65536"
,
"--tp"
,
"6"
,
],
min_throughput
=
100
,
)
)
def
test_latency_
w8a8_moe
_model
(
self
):
def
test_latency_
default
_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
_W8A8_WITH_MOE
return
DEFAULT_MODEL_NAME_FOR_TEST
def
test_mmlu
(
self
):
def
test_mmlu
(
self
):
model
=
DEFAULT_MLA_MODEL_NAME_FOR_TEST
model
=
DEFAULT_MLA_MODEL_NAME_FOR_TEST
...
...
test/srt/cpu/test_intel_amx_attention_backend_b.py
0 → 100644
View file @
cbac4997
"""
For intel_amx attention backend FP8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_b.TestIntelAMXAttnBackendQuant.test_latency_fp8_qwen
"""
import
unittest
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
,
CustomTestCase
,
intel_amx_benchmark
,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """FP8 latency benchmarks for the intel_amx attention backend.

    Each test method only returns a model name; the ``intel_amx_benchmark``
    decorator is given the extra arguments and the ``min_throughput`` value.
    """

    # Extra arguments used by both FP8 benchmarks. Each decorator call below
    # receives its own list built from this tuple.
    _FP8_EXTRA_ARGS = ("--batch-size", "4", "--mem-fraction-static", "0.1")

    @intel_amx_benchmark(extra_args=list(_FP8_EXTRA_ARGS), min_throughput=150)
    def test_latency_fp8_qwen(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(extra_args=list(_FP8_EXTRA_ARGS), min_throughput=50)
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
# Run this module's tests through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
test/srt/cpu/test_intel_amx_attention_backend_c.py
0 → 100644
View file @
cbac4997
"""
For intel_amx attention backend w8a8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_c.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
"""
import
unittest
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST_W8A8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
,
CustomTestCase
,
intel_amx_benchmark
,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """w8a8 latency benchmarks for the intel_amx attention backend.

    Each test method only returns a model name; the ``intel_amx_benchmark``
    decorator is given the extra arguments and the ``min_throughput`` value.
    """

    # Leading extra arguments common to both w8a8 benchmarks.
    _W8A8_BASE_ARGS = ("--batch-size", "4", "--quantization", "w8a8_int8")

    @intel_amx_benchmark(
        extra_args=[*_W8A8_BASE_ARGS, "--mem-fraction-static", "0.1"],
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    @intel_amx_benchmark(
        extra_args=[
            *_W8A8_BASE_ARGS,
            "--mem-fraction-static",
            "0.9",
            "--max-total-tokens",
            "65536",
            "--tp",
            "6",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
# Run this module's tests through unittest when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
test/srt/run_suite.py
View file @
cbac4997
...
@@ -312,8 +312,10 @@ suite_xeon = {
...
@@ -312,8 +312,10 @@ suite_xeon = {
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"test_cpu_graph.py"
),
TestFile
(
"cpu/test_cpu_graph.py"
),
TestFile
(
"test_intel_amx_attention_backend.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_a.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_b.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_c.py"
),
],
],
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment