Commit 029e0af3 (unverified), parent 64574ef8
Authored by DiweiSun on Aug 21, 2025; committed by GitHub on Aug 21, 2025

ci: enhance xeon ci (#9395)
Showing 3 changed files with 81 additions and 18 deletions (+81 -18):
.github/workflows/pr-test-xeon.yml            +5  -2
python/sglang/test/test_utils.py              +6  -0
test/srt/test_intel_amx_attention_backend.py  +70 -16
.github/workflows/pr-test-xeon.yml

@@ -28,6 +28,8 @@ jobs:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
     runs-on: xeon-gnr
+    env:
+      HF_HOME: /home/sdp/.cache/huggingface
     strategy:
       matrix:
         build_type: ['all']

@@ -46,6 +48,7 @@ jobs:
         run: |
           docker run -dt \
             -v ${{ github.workspace }}:/sglang-checkout/ --ipc=host \
+            -v ${HF_HOME}:/root/.cache/huggingface \
             --name ci_sglang_xeon \
             sglang_xeon

@@ -67,13 +70,13 @@ jobs:
       - name: Run unit tests
         if: steps.check_amx.outcome == 'success'
-        timeout-minutes: 20
+        timeout-minutes: 30
         run: |
           docker exec -w /sglang-checkout/ ci_sglang_xeon \
             bash -c "cd ./test/srt && python3 run_suite.py --suite per-commit-cpu"

       - name: Change permission
-        timeout-minutes: 20
+        timeout-minutes: 2
         run: |
           docker exec -u root ci_sglang_xeon bash -c "
             rm -rf /tmp/ci-home &&
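Note on the HF_HOME change above: exporting HF_HOME on the runner and bind-mounting it to /root/.cache/huggingface inside the container lets successive CI runs reuse already-downloaded model weights. A minimal sketch (not part of this commit; the model ID is just one of the checkpoints referenced later in this diff) of how the mounted cache is picked up inside the container:

    import os

    # Inside the CI container, /root/.cache/huggingface is the bind-mounted host cache.
    # Set HF_HOME before importing huggingface_hub so the library resolves its cache
    # directory from it.
    os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")

    from huggingface_hub import snapshot_download

    # If the snapshot already exists in the mounted cache, this returns the local path
    # without re-downloading the weights.
    local_path = snapshot_download("Qwen/Qwen3-1.7B-FP8")
    print(local_path)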
python/sglang/test/test_utils.py

@@ -61,6 +61,12 @@ DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
 DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
+DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8 = "Qwen/Qwen3-1.7B-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
+
+# W8A8 models
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
+DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
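The constants added here point at FP8 and W8A8 checkpoints on the Hugging Face Hub and are consumed by the new Xeon tests in the next file. A quick sanity check (illustrative only, not part of the commit) that the newly referenced repos resolve:

    # Illustrative only: confirm the model IDs added to test_utils.py exist on the Hub.
    from huggingface_hub import model_info

    for repo_id in (
        "Qwen/Qwen3-1.7B-FP8",
        "gaunernst/DeepSeek-V2-Lite-Chat-FP8",
        "RedHatAI/Llama-3.2-3B-quantized.w8a8",
        "nytopop/Qwen3-30B-A3B.w8a8",
    ):
        print(repo_id, model_info(repo_id).sha)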
test/srt/test_intel_amx_attention_backend.py

@@ -3,13 +3,20 @@ Usage:
 python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
 """
 
 import os
 import unittest
 from functools import wraps
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MLA_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE,
     DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8,
     DEFAULT_MODEL_NAME_FOR_TEST_W8A8,
     DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -19,28 +26,75 @@ from sglang.test.test_utils import (
 )
 
 
-class TestIntelAMXAttnBackend(CustomTestCase):
-    def test_latency(self):
-        prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
-            DEFAULT_MLA_MODEL_NAME_FOR_TEST,
-            [
+def intel_amx_benchmark(extra_args=None, min_throughput=None):
+    def decorator(test_func):
+        @wraps(test_func)
+        def wrapper(self):
+            common_args = [
                 "--attention-backend",
                 "intel_amx",
                 "--mem-fraction-static",
                 "0.05",
                 "--disable-radix",
                 "--trust-remote-code",
                 "--batch-size",
                 "4",
-            ],
-        )
+            ]
+            full_args = common_args + (extra_args or [])
+
+            model = test_func(self)
+            prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
+                model, full_args
+            )
+
+            print(f"{model=}")
+            print(f"{prefill_latency=}")
+            print(f"{decode_throughput=}")
+            print(f"{decode_latency=}")
+
+            if is_in_ci() and min_throughput is not None:
+                self.assertGreater(decode_throughput, min_throughput)
+
+        return wrapper
 
-        print(f"{prefill_latency=}")
-        print(f"{decode_throughput=}")
-        print(f"{decode_latency=}")
+    return decorator
 
-        if is_in_ci():
-            self.assertGreater(decode_throughput, 10)
 
+class TestIntelAMXAttnBackend(CustomTestCase):
+    @intel_amx_benchmark(min_throughput=10)
+    def test_latency_mla_model(self):
+        return DEFAULT_MLA_MODEL_NAME_FOR_TEST
+
+    @intel_amx_benchmark(min_throughput=40)
+    def test_latency_default_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST
+
+    @intel_amx_benchmark(min_throughput=150)
+    def test_latency_fp8_qwen(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_QWEN_FP8
+
+    @intel_amx_benchmark(min_throughput=50)
+    def test_latency_fp8_moe_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE
+
+    @intel_amx_benchmark(
+        extra_args=["--quantization", "w8a8_int8"], min_throughput=100
+    )
+    def test_latency_w8a8_default_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8
+
+    @intel_amx_benchmark(
+        extra_args=[
+            "--quantization",
+            "w8a8_int8",
+            "--mem-fraction-static",
+            "0.9",
+            "--max-total-tokens",
+            "65536",
+            "--tp",
+            "6",
+        ],
+        min_throughput=100,
+    )
+    def test_latency_w8a8_moe_model(self):
+        return DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE
+
     def test_mmlu(self):
         model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
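The refactor above replaces the single hard-coded test_latency with a decorator factory: each test method now only returns a model name, while intel_amx_benchmark supplies the shared intel_amx server arguments, runs run_bench_one_batch, and enforces a per-model decode-throughput floor only in CI. A stripped-down, self-contained sketch of the same decorator-factory pattern (toy names; not part of the commit) showing how the wrapped method's return value is consumed:

    from functools import wraps


    def benchmark(extra_args=None, min_throughput=None):
        # Decorator factory: its parameters are closed over by the wrapper below.
        def decorator(test_func):
            @wraps(test_func)  # preserve the original test's name/docstring for reporting
            def wrapper(self):
                args = ["--common-flag"] + (extra_args or [])
                model = test_func(self)               # the test body only selects the model
                throughput = fake_bench(model, args)  # stand-in for run_bench_one_batch
                if min_throughput is not None:
                    assert throughput > min_throughput
            return wrapper
        return decorator


    def fake_bench(model, args):
        # Placeholder benchmark; the real helper returns
        # (prefill_latency, decode_throughput, decode_latency).
        print(f"benchmarking {model} with {args}")
        return 42.0


    class Demo:
        @benchmark(extra_args=["--quantization", "w8a8_int8"], min_throughput=10)
        def test_latency_demo(self):
            return "some/model-id"


    Demo().test_latency_demo()  # prints the benchmark line; 42.0 > 10, so the assert passes

Keeping functools.wraps means unittest output still reports the original method name instead of "wrapper".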
@@ -68,9 +122,9 @@ class TestIntelAMXAttnBackend(CustomTestCase):
                 num_examples=64,
                 num_threads=32,
             )
             metrics = run_eval(args)
-            self.assertGreater(metrics["score"], 0.45)
+            if is_in_ci():
+                self.assertGreater(metrics["score"], 0.45)
         finally:
             kill_process_tree(process.pid)