Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
cbac4997
Unverified
Commit
cbac4997
authored
Oct 16, 2025
by
YanbingJiang
Committed by
GitHub
Oct 15, 2025
Browse files
Split test_intel_amx_attention_backend.py to pass CI of timeout (#11370)
Co-authored-by:
Ma Mingfei
<
mingfei.ma@intel.com
>
parent
476c67d7
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
197 additions
and
5 deletions
+197
-5
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+31
-1
test/srt/cpu/test_cpu_graph.py
test/srt/cpu/test_cpu_graph.py
+1
-2
test/srt/cpu/test_intel_amx_attention_backend_a.py
test/srt/cpu/test_intel_amx_attention_backend_a.py
+73
-0
test/srt/cpu/test_intel_amx_attention_backend_b.py
test/srt/cpu/test_intel_amx_attention_backend_b.py
+35
-0
test/srt/cpu/test_intel_amx_attention_backend_c.py
test/srt/cpu/test_intel_amx_attention_backend_c.py
+53
-0
test/srt/run_suite.py
test/srt/run_suite.py
+4
-2
No files found.
python/sglang/test/test_utils.py
View file @
cbac4997
...
...
@@ -16,7 +16,7 @@ import unittest
from
concurrent.futures
import
ThreadPoolExecutor
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
functools
import
partial
from
functools
import
partial
,
wraps
from
pathlib
import
Path
from
types
import
SimpleNamespace
from
typing
import
Any
,
Awaitable
,
Callable
,
List
,
Optional
,
Tuple
...
...
@@ -1807,3 +1807,33 @@ def write_results_to_json(model, metrics, mode="a"):
with
open
(
"results.json"
,
"w"
)
as
f
:
json
.
dump
(
existing_results
,
f
,
indent
=
2
)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Build a decorator that turns a model-returning test into a benchmark.

    The decorated test method is only expected to return a model identifier.
    The generated wrapper passes that model, together with the intel_amx
    launch flags and any ``extra_args``, to ``run_bench_one_batch``, prints
    the three values it returns, and checks the decode throughput.

    Args:
        extra_args: Optional list of additional command-line arguments that
            is appended after the common intel_amx arguments.
        min_throughput: Optional lower bound for the decode throughput. It
            is asserted only when ``is_in_ci()`` is true and this value is
            not ``None``.

    Returns:
        A decorator for methods taking only ``self``.
    """

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            # The argument list is assembled before the test body runs, so a
            # bad ``extra_args`` value fails before the model is resolved.
            bench_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ] + (extra_args or [])

            model = test_func(self)
            measurements = run_bench_one_batch(model, bench_args)
            prefill_latency, decode_throughput, decode_latency = measurements

            # The local names are part of the output: ``{name=}`` prints them.
            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            # No threshold check outside CI or when no bound was given.
            if not is_in_ci() or min_throughput is None:
                return
            self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
test/srt/test_cpu_graph.py
→
test/srt/
cpu/
test_cpu_graph.py
View file @
cbac4997
...
...
@@ -8,8 +8,6 @@ import os
import
unittest
from
types
import
SimpleNamespace
from
test_intel_amx_attention_backend
import
intel_amx_benchmark
from
sglang.srt.utils
import
get_cpu_ids_by_node
,
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
...
...
@@ -17,6 +15,7 @@ from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
intel_amx_benchmark
,
is_in_ci
,
popen_launch_server
,
)
...
...
test/srt/test_intel_amx_attention_backend.py
→
test/srt/
cpu/
test_intel_amx_attention_backend
_a
.py
View file @
cbac4997
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_
mmlu
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_
latency_default_model
"""
import
unittest
from
functools
import
wraps
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
...
...
@@ -12,91 +11,30 @@ from sglang.test.run_eval import run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
intel_amx_benchmark
,
is_in_ci
,
popen_launch_server
,
run_bench_one_batch
,
)
def intel_amx_benchmark(extra_args=None, min_throughput=None):
    """Build a decorator that turns a model-returning test into a benchmark.

    The decorated test method is only expected to return a model identifier.
    The generated wrapper passes that model, together with the intel_amx
    launch flags and any ``extra_args``, to ``run_bench_one_batch``, prints
    the three values it returns, and checks the decode throughput.

    Args:
        extra_args: Optional list of additional command-line arguments that
            is appended after the common intel_amx arguments.
        min_throughput: Optional lower bound for the decode throughput. It
            is asserted only when ``is_in_ci()`` is true and this value is
            not ``None``.

    Returns:
        A decorator for methods taking only ``self``.
    """

    def decorator(test_func):
        @wraps(test_func)
        def wrapper(self):
            # The argument list is assembled before the test body runs, so a
            # bad ``extra_args`` value fails before the model is resolved.
            bench_args = [
                "--attention-backend",
                "intel_amx",
                "--disable-radix",
                "--trust-remote-code",
            ] + (extra_args or [])

            model = test_func(self)
            measurements = run_bench_one_batch(model, bench_args)
            prefill_latency, decode_throughput, decode_latency = measurements

            # The local names are part of the output: ``{name=}`` prints them.
            print(f"{model=}")
            print(f"{prefill_latency=}")
            print(f"{decode_throughput=}")
            print(f"{decode_latency=}")

            # No threshold check outside CI or when no bound was given.
            if not is_in_ci() or min_throughput is None:
                return
            self.assertGreater(decode_throughput, min_throughput)

        return wrapper

    return decorator
class
TestIntelAMXAttnBackend
(
CustomTestCase
):
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
10
)
def
test_latency_mla_model
(
self
):
return
DEFAULT_MLA_MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
40
)
def
test_latency_default_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
150
)
def
test_latency_fp8_qwen
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
],
min_throughput
=
50
)
def
test_latency_fp8_moe_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
,
"--
quantiz
ati
on
"
,
"
w8a8_int8
"
],
min_throughput
=
10
0
,
extra_args
=
[
"--batch-size"
,
"4"
,
"--
mem-fraction-st
ati
c
"
,
"
0.3
"
],
min_throughput
=
10
,
)
def
test_latency_
w8a8_default
_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
_W8A8
def
test_latency_
mla
_model
(
self
):
return
DEFAULT_
MLA_
MODEL_NAME_FOR_TEST
@
intel_amx_benchmark
(
extra_args
=
[
"--batch-size"
,
"4"
,
"--quantization"
,
"w8a8_int8"
,
"--mem-fraction-static"
,
"0.9"
,
"--max-total-tokens"
,
"65536"
,
"--tp"
,
"6"
,
],
min_throughput
=
100
,
extra_args
=
[
"--batch-size"
,
"4"
,
"--mem-fraction-static"
,
"0.1"
],
min_throughput
=
40
,
)
def
test_latency_
w8a8_moe
_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
_W8A8_WITH_MOE
def
test_latency_
default
_model
(
self
):
return
DEFAULT_MODEL_NAME_FOR_TEST
def
test_mmlu
(
self
):
model
=
DEFAULT_MLA_MODEL_NAME_FOR_TEST
...
...
test/srt/cpu/test_intel_amx_attention_backend_b.py
0 → 100644
View file @
cbac4997
"""
For intel_amx attention backend FP8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_b.TestIntelAMXAttnBackendQuant.test_latency_fp8_qwen
"""
import
unittest
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
,
DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
,
CustomTestCase
,
intel_amx_benchmark
,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """Latency benchmarks for the FP8 models on the intel_amx backend."""

    # Each test body only names the model to benchmark; the decorator runs
    # the benchmark and applies the ``min_throughput`` bound.

    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=150,
    )
    def test_latency_fp8_qwen(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8

    @intel_amx_benchmark(
        extra_args=["--batch-size", "4", "--mem-fraction-static", "0.1"],
        min_throughput=50,
    )
    def test_latency_fp8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE


# Allow running this file directly as a script.
if __name__ == "__main__":
    unittest.main()
test/srt/cpu/test_intel_amx_attention_backend_c.py
0 → 100644
View file @
cbac4997
"""
For intel_amx attention backend w8a8 tests
Usage:
python3 -m unittest test_intel_amx_attention_backend_c.TestIntelAMXAttnBackendQuant.test_latency_w8a8_default_model
"""
import
unittest
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST_W8A8
,
DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
,
CustomTestCase
,
intel_amx_benchmark
,
)
class TestIntelAMXAttnBackendQuant(CustomTestCase):
    """Latency benchmarks for the w8a8 models on the intel_amx backend."""

    # Each test body only names the model to benchmark; the decorator runs
    # the benchmark and applies the ``min_throughput`` bound.

    @intel_amx_benchmark(
        extra_args=[
            "--batch-size",
            "4",
            "--quantization",
            "w8a8_int8",
            "--mem-fraction-static",
            "0.1",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_default_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8

    @intel_amx_benchmark(
        extra_args=[
            "--batch-size",
            "4",
            "--quantization",
            "w8a8_int8",
            "--mem-fraction-static",
            "0.9",
            "--max-total-tokens",
            "65536",
            "--tp",
            "6",
        ],
        min_throughput=100,
    )
    def test_latency_w8a8_moe_model(self):
        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE


# Allow running this file directly as a script.
if __name__ == "__main__":
    unittest.main()
test/srt/run_suite.py
View file @
cbac4997
...
...
@@ -312,8 +312,10 @@ suite_xeon = {
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"test_cpu_graph.py"
),
TestFile
(
"test_intel_amx_attention_backend.py"
),
TestFile
(
"cpu/test_cpu_graph.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_a.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_b.py"
),
TestFile
(
"cpu/test_intel_amx_attention_backend_c.py"
),
],
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment