Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
3b3f1e3a
Unverified
Commit
3b3f1e3a
authored
Jun 29, 2025
by
Hubert Lu
Committed by
GitHub
Jun 29, 2025
Browse files
[AMD] Add unit-test-sgl-kernel-amd to AMD CI (#7539)
parent
b691dcc4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
55 additions
and
12 deletions
+55
-12
.github/workflows/pr-test-amd.yml
.github/workflows/pr-test-amd.yml
+35
-1
scripts/amd_ci_install_dependency.sh
scripts/amd_ci_install_dependency.sh
+1
-0
test/srt/test_custom_allreduce.py
test/srt/test_custom_allreduce.py
+19
-11
No files found.
.github/workflows/pr-test-amd.yml
View file @
3b3f1e3a
...
...
@@ -290,12 +290,46 @@ jobs:
run
:
|
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
-
name
:
Run CustomAllReduce test
timeout-minutes
:
10
run
:
|
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce
unit-test-sgl-kernel-amd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
strategy
:
fail-fast
:
false
matrix
:
runner
:
[
linux-mi300-gpu-1
,
linux-mi325-gpu-1
]
runs-on
:
${{matrix.runner}}
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
-
name
:
Start CI container
run
:
bash scripts/amd_ci_start_container.sh
env
:
GITHUB_WORKSPACE
:
${{ github.workspace }}
-
name
:
Install dependencies
run
:
|
bash scripts/amd_ci_install_dependency.sh
-
name
:
Run test
timeout-minutes
:
10
run
:
|
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
finish
:
if
:
always()
needs
:
[
accuracy-test-1-gpu-amd
,
mla-test-1-gpu-amd
,
bench-test-2-gpu-amd
,
accuracy-test-2-gpu-amd
,
performance-test-1-gpu-part-1-amd
,
performance-test-1-gpu-part-2-amd
,
unit-test-backend-1-gpu-amd
,
unit-test-backend-2-gpu-amd
,
unit-test-backend-8-gpu-amd
unit-test-backend-1-gpu-amd
,
unit-test-backend-2-gpu-amd
,
unit-test-backend-8-gpu-amd
,
unit-test-sgl-kernel-amd
]
runs-on
:
ubuntu-latest
steps
:
...
...
scripts/amd_ci_install_dependency.sh
View file @
3b3f1e3a
...
...
@@ -19,3 +19,4 @@ mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpubli
docker
cp
./dummy-grok ci_sglang:/
docker
exec
ci_sglang pip
install
huggingface_hub[hf_xet]
docker
exec
ci_sglang pip
install
pytest
test/srt/test_custom_allreduce.py
View file @
3b3f1e3a
...
...
@@ -56,22 +56,30 @@ def multi_process_parallel(
class
TestCustomAllReduce
(
CustomTestCase
):
TEST_SIZES
=
[
512
,
4096
,
32768
,
262144
,
2097152
,
16777216
,
33554432
,
]
# 512B...32MB
WORLD_SIZES
=
[
2
,
4
,
6
,
8
]
TEST_LOOP
=
10
@
classmethod
def
setUpClass
(
cls
):
random
.
seed
(
42
)
# 512B to 32MB
cls
.
test_sizes
=
[
512
,
4096
,
32768
,
262144
,
2097152
,
16777216
,
33554432
]
cls
.
world_sizes
=
[
2
,
4
,
6
,
8
]
cls
.
test_loop
=
10
random
.
seed
(
42
)
# keep the deterministic seed
def
test_graph_allreduce
(
self
):
for
world_size
in
self
.
world_sizes
:
for
world_size
in
self
.
WORLD_SIZES
:
if
world_size
>
torch
.
cuda
.
device_count
():
continue
multi_process_parallel
(
world_size
,
self
,
self
.
graph_allreduce
)
def
test_eager_allreduce
(
self
):
for
world_size
in
self
.
world_sizes
:
for
world_size
in
self
.
WORLD_SIZES
:
if
world_size
>
torch
.
cuda
.
device_count
():
continue
multi_process_parallel
(
world_size
,
self
,
self
.
eager_allreduce
)
...
...
@@ -102,9 +110,9 @@ class TestCustomAllReduce(CustomTestCase):
torch
.
cuda
.
synchronize
()
del
data
for
sz
in
self
.
test_sizes
:
for
sz
in
self
.
TEST_SIZES
:
for
dtype
in
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
]:
for
_
in
range
(
self
.
test_loop
):
for
_
in
range
(
self
.
TEST_LOOP
):
with
graph_capture
()
as
graph_capture_context
:
# use integers so result matches NCCL exactly
inp1
=
torch
.
randint
(
...
...
@@ -151,9 +159,9 @@ class TestCustomAllReduce(CustomTestCase):
initialize_model_parallel
(
tensor_model_parallel_size
=
world_size
)
group
=
get_tensor_model_parallel_group
().
device_group
for
sz
in
self
.
test_sizes
:
for
sz
in
self
.
TEST_SIZES
:
for
dtype
in
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
]:
for
_
in
range
(
self
.
test_loop
):
for
_
in
range
(
self
.
TEST_LOOP
):
inp1
=
torch
.
randint
(
1
,
16
,
(
sz
,),
dtype
=
dtype
,
device
=
torch
.
cuda
.
current_device
()
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment