Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
73bc1d00
Unverified
Commit
73bc1d00
authored
May 01, 2025
by
Sai Enduri
Committed by
GitHub
May 01, 2025
Browse files
Add 1 gpu perf and 2 gpu accuracy tests for AMD MI300x CI. (#5960)
parent
c5645e92
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
171 additions
and
11 deletions
+171
-11
.github/workflows/pr-test-amd.yml
.github/workflows/pr-test-amd.yml
+147
-5
test/srt/test_bench_serving.py
test/srt/test_bench_serving.py
+24
-6
No files found.
.github/workflows/pr-test-amd.yml
View file @
73bc1d00
...
...
@@ -55,10 +55,6 @@ jobs:
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/
-
name
:
Evaluate Accuracy
timeout-minutes
:
20
run
:
|
...
...
@@ -66,6 +62,44 @@ jobs:
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py
# 2-GPU accuracy job for AMD MI300x runners: builds the ROCm sgl-kernel from
# this checkout inside the CI container, then runs the large MoE accuracy test
# with tensor parallelism across both GPUs.
accuracy-test-2-gpu-amd:
  # Run on the upstream repo or on any pull request, but skip draft PRs.
  if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false
  runs-on: linux-mi300-gpu-2
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Setup docker
      run: |
        # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
        # Start a long-lived container named ci_sglang with the checkout
        # mounted at /sglang-checkout; later steps `docker exec` into it.
        docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
          -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
          --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
          -w /sglang-checkout --name ci_sglang \
          ghcr.io/saienduri/sglang-aiter-v0.1.1:428

    - name: Install dependencies
      run: |
        docker exec ci_sglang pip install --upgrade pip
        # Replace any prebuilt sgl-kernel with the ROCm build from this checkout.
        docker exec ci_sglang pip uninstall sgl-kernel -y || true
        docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
        docker exec ci_sglang pip install -e "python[dev_hip]"
        docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
        docker exec -w /human-eval ci_sglang pip install -e .

    - name: Evaluate accuracy (TP=2)
      timeout-minutes: 20
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_moe_eval_accuracy_large.py
mla-test-1-gpu-amd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
...
...
@@ -104,6 +138,113 @@ jobs:
run
:
|
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py
# Single-GPU performance job (part 1) for AMD MI300x runners: latency and
# throughput benchmarks driven through unittest against a dockerized sglang.
# SGLANG_AMD_CI=1 tells the test suite to use the AMD-specific thresholds.
performance-test-1-gpu-part-1-amd:
  # Run on the upstream repo or on any pull request, but skip draft PRs.
  if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false
  runs-on: linux-mi300-gpu-1
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Setup docker
      run: |
        # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
        # Start a long-lived container named ci_sglang with the checkout
        # mounted at /sglang-checkout; later steps `docker exec` into it.
        docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
          -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
          --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
          -w /sglang-checkout --name ci_sglang \
          ghcr.io/saienduri/sglang-aiter-v0.1.1:428

    - name: Install dependencies
      run: |
        docker exec ci_sglang pip install --upgrade pip
        # Replace any prebuilt sgl-kernel with the ROCm build from this checkout.
        docker exec ci_sglang pip uninstall sgl-kernel -y || true
        docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
        docker exec ci_sglang pip install -e "python[dev_hip]"
        docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
        docker exec -w /human-eval ci_sglang pip install -e .

    - name: Benchmark single latency
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

    - name: Benchmark online latency
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

    - name: Benchmark offline throughput
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

    - name: Benchmark offline throughput (Non-streaming, small batch size)
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

    - name: Benchmark online latency (EAGLE)
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
# Single-GPU performance job (part 2) for AMD MI300x runners: offline
# throughput benchmarks in alternate configurations (no RadixAttention,
# Triton attention backend, FP8). SGLANG_AMD_CI=1 selects AMD thresholds.
performance-test-1-gpu-part-2-amd:
  # Run on the upstream repo or on any pull request, but skip draft PRs.
  if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false
  runs-on: linux-mi300-gpu-1
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Setup docker
      run: |
        # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
        if [ -f "/etc/podinfo/gha-render-devices" ]; then
          DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
        else
          DEVICE_FLAG="--device /dev/dri"
        fi
        docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
        # Start a long-lived container named ci_sglang with the checkout
        # mounted at /sglang-checkout; later steps `docker exec` into it.
        docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
          -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
          --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
          -w /sglang-checkout --name ci_sglang \
          ghcr.io/saienduri/sglang-aiter-v0.1.1:428

    - name: Install dependencies
      run: |
        docker exec ci_sglang pip install --upgrade pip
        # Replace any prebuilt sgl-kernel with the ROCm build from this checkout.
        docker exec ci_sglang pip uninstall sgl-kernel -y || true
        docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
        docker exec ci_sglang pip install -e "python[dev_hip]"
        docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
        docker exec -w /human-eval ci_sglang pip install -e .

    - name: Benchmark offline throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

    - name: Benchmark offline throughput (w/ Triton)
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

    - name: Benchmark offline throughput (w/ FP8)
      timeout-minutes: 10
      run: |
        docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
bench-test-2-gpu-amd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
...
...
@@ -169,7 +310,8 @@ jobs:
finish
:
if
:
always()
needs
:
[
accuracy-test-1-gpu-amd
,
mla-test-1-gpu-amd
,
bench-test-2-gpu-amd
accuracy-test-1-gpu-amd
,
mla-test-1-gpu-amd
,
bench-test-2-gpu-amd
,
accuracy-test-2-gpu-amd
,
performance-test-1-gpu-part-1-amd
,
performance-test-1-gpu-part-2-amd
]
runs-on
:
ubuntu-latest
steps
:
...
...
test/srt/test_bench_serving.py
View file @
73bc1d00
...
...
@@ -29,6 +29,9 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_default
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3500
)
else
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3800
)
def
test_offline_throughput_non_stream_small_batch_size
(
self
):
...
...
@@ -64,6 +67,9 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_without_radix_cache
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3500
)
else
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3800
)
def
test_offline_throughput_without_chunked_prefill
(
self
):
...
...
@@ -99,6 +105,9 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_with_triton_attention_backend
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3500
)
else
:
self
.
assertGreater
(
res
[
"output_throughput"
],
3700
)
def
test_offline_throughput_default_fp8
(
self
):
...
...
@@ -114,6 +123,9 @@ class TestBenchServing(CustomTestCase):
f
"### test_offline_throughput_default_fp8
\n
"
f
'Output throughput:
{
res
[
"output_throughput"
]:.
2
f
}
token/s
\n
'
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertGreater
(
res
[
"output_throughput"
],
4000
)
else
:
self
.
assertGreater
(
res
[
"output_throughput"
],
4300
)
def
test_online_latency_default
(
self
):
...
...
@@ -130,6 +142,9 @@ class TestBenchServing(CustomTestCase):
f
'median_e2e_latency_ms:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
)
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
11000
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertLess
(
res
[
"median_ttft_ms"
],
115
)
else
:
self
.
assertLess
(
res
[
"median_ttft_ms"
],
86
)
self
.
assertLess
(
res
[
"median_itl_ms"
],
10
)
...
...
@@ -165,6 +180,9 @@ class TestBenchServing(CustomTestCase):
f
'median_e2e_latency_ms:
{
res
[
"median_e2e_latency_ms"
]:.
2
f
}
ms
\n
'
f
'accept_length:
{
res
[
"accept_length"
]:.
2
f
}
\n
'
)
if
os
.
getenv
(
"SGLANG_AMD_CI"
)
==
"1"
:
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
1450
)
else
:
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
900
)
self
.
assertGreater
(
res
[
"accept_length"
],
3.0
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment