[AMD] Add unit-test-sgl-kernel-amd to AMD CI (#7539)

3b3f1e3a · Hubert Lu · GitHub · b691dcc4 · 3b3f1e3a · 3b3f1e3a
Unverified commit 3b3f1e3a, authored Jun 29, 2025 by Hubert Lu; committed by GitHub on Jun 29, 2025
3 changed files
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -290,12 +290,46 @@ jobs:
        run: |
          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600

+      - name: Run CustomAllReduce test
+        timeout-minutes: 10
+        run: |
+          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+
+  unit-test-sgl-kernel-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
+
  finish:
    if: always()
    needs: [
      accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
      accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
-      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd
+      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd,
+      unit-test-sgl-kernel-amd
    ]
    runs-on: ubuntu-latest
    steps:

--- a/scripts/amd_ci_install_dependency.sh
+++ b/scripts/amd_ci_install_dependency.sh
@@ -19,3 +19,4 @@ mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpubli
 docker cp ./dummy-grok ci_sglang:/

 docker exec ci_sglang pip install huggingface_hub[hf_xet]
+docker exec ci_sglang pip install pytest
--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -56,22 +56,30 @@ def multi_process_parallel(


 class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512B...32MB
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
    @classmethod
    def setUpClass(cls):
-        random.seed(42)
-        # 512B to 32MB
-        cls.test_sizes = [512, 4096, 32768, 262144, 2097152, 16777216, 33554432]
-        cls.world_sizes = [2, 4, 6, 8]
-        cls.test_loop = 10
+        random.seed(42)  # keep the deterministic seed

    def test_graph_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
            if world_size > torch.cuda.device_count():
                continue
            multi_process_parallel(world_size, self, self.graph_allreduce)

    def test_eager_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
            if world_size > torch.cuda.device_count():
                continue
            multi_process_parallel(world_size, self, self.eager_allreduce)
@@ -102,9 +110,9 @@ class TestCustomAllReduce(CustomTestCase):
        torch.cuda.synchronize()
        del data

-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                    with graph_capture() as graph_capture_context:
                        # use integers so result matches NCCL exactly
                        inp1 = torch.randint(
@@ -151,9 +159,9 @@ class TestCustomAllReduce(CustomTestCase):
        initialize_model_parallel(tensor_model_parallel_size=world_size)
        group = get_tensor_model_parallel_group().device_group

-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                    inp1 = torch.randint(
                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                    )