"src/libtorchaudio/sox/pybind/pybind.cpp" did not exist on "1706a72fa5762a1666896457426d6aa703b7f609"
Unverified commit 0dd6cf16, authored by Hank Han, committed by GitHub
Browse files

[ci]use H20 to run disaggregation test (#11543)

parent 0975ba99
# CI workflow: runs the H20 disaggregation test suite for PRs touching
# DeepSeek models, the MoE layers, this workflow, or the Python package config.
name: PR Test (H20)

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    # 'labeled' lets maintainers trigger the run by attaching 'run-ci';
    # 'synchronize' re-runs on new pushes to an already-labeled PR.
    types: [synchronize, labeled]
  workflow_dispatch:
    inputs:
      version:
        required: true
        type: choice
        default: 'release'
        options:
          - 'release'
          - 'nightly'

# Cancel any in-flight run for the same ref when a newer one starts.
concurrency:
  group: pr-test-h20-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Gate job: enforces the 'run-ci' label / non-draft policy and detects
  # whether any H20-relevant files changed.
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      h20_files: ${{ steps.filter.outputs.h20_files }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Fail if the PR does not have the 'run-ci' label
        if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
        run: |
          echo "This pull request does not have the 'run-ci' label. Failing the workflow."
          exit 1
      - name: Fail if the PR is a draft
        if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
        run: |
          echo "This pull request is a draft. Failing the workflow."
          exit 1
      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        with:
          filters: |
            h20_files:
              - "python/sglang/srt/models/deepseek*"
              - "python/sglang/srt/layers/moe/**"
              - ".github/workflows/pr-test-h20.yml"
              - "python/pyproject.toml"

  # Runs the disaggregation suite on a self-hosted 8-GPU H20 runner.
  per-commit-8-gpu-h20:
    needs: [check-changes]
    if: needs.check-changes.outputs.h20_files == 'true'
    runs-on: 8-gpu-h20
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          bash scripts/ci/ci_install_dependency.sh
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 run_suite.py --suite per-commit-8-gpu-h20

  # Aggregate status check: fails if any dependent job failed or was
  # cancelled; skipped jobs (label/path filters) are treated as success.
  pr-test-h20-finish:
    needs: [
      check-changes,
      per-commit-8-gpu-h20,
    ]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        run: |
          # Convert the 'needs' context to a JSON string
          json_needs='${{ toJson(needs) }}'
          # Get a list of all job names from the JSON keys
          job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')
          for job in $job_names; do
            # For each job, extract its result
            result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')
            # Print the job name and its result
            echo "$job: $result"
            # Check for failure or cancellation and exit if found
            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              echo "The above jobs failed."
              exit 1
            fi
          done
          # If the loop completes, all jobs were successful
          echo "All jobs completed successfully"
          exit 0
......@@ -350,6 +350,39 @@ jobs:
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
unit-test-backend-8-gpu-h20:
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
runs-on: 8-gpu-h20
env:
SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
strategy:
fail-fast: false
matrix:
part: [0, 1]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Download artifacts
if: needs.check-changes.outputs.sgl_kernel == 'true'
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9
- name: Install dependencies
run: |
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
- name: Run test
timeout-minutes: 20
run: |
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
performance-test-1-gpu-part-1:
needs: [check-changes, sgl-kernel-build-wheels]
if: always() && !failure() && !cancelled() &&
......
import logging
import os
import time
import warnings
......@@ -15,6 +16,8 @@ from sglang.test.test_utils import (
popen_with_error_check,
)
logger = logging.getLogger(__name__)
class TestDisaggregationBase(CustomTestCase):
@classmethod
......@@ -100,11 +103,28 @@ class TestDisaggregationBase(CustomTestCase):
def get_rdma_devices_args() -> str:
    """Resolve the comma-separated RDMA NIC list matching the visible GPUs.

    The NIC pool is taken from the ``SGLANG_CI_RDMA_ALL_DEVICES`` env var
    (comma separated), falling back to ``mlx5_roce0`` .. ``mlx5_roce7``.
    Visible GPUs (``CUDA_VISIBLE_DEVICES``) are mapped onto the pool
    assuming the host exposes 8 GPUs spread evenly over the NICs — TODO
    confirm this layout matches the CI hosts. Whenever the mapping cannot
    be determined (env unset, unparsable, empty, or more than 4 GPUs), a
    default pair (first NIC plus the NIC halfway through the pool) is
    returned instead.

    Returns:
        A comma-separated string of RDMA device names, e.g.
        ``"mlx5_roce0,mlx5_roce4"``.
    """
    log = logging.getLogger(__name__)

    def _parse_list_env(var_name: str):
        # Parse a comma-separated env var into a non-empty list, else None.
        val = os.getenv(var_name)
        if not val:
            return None
        items = [x.strip() for x in val.split(",") if x.strip()]
        return items or None

    def _pick_default_pair(devices):
        # Fallback choice: first device plus the one halfway through the pool.
        return [devices[0], devices[len(devices) // 2]]

    rdma_all_devices = _parse_list_env("SGLANG_CI_RDMA_ALL_DEVICES") or [
        f"mlx5_roce{i}" for i in range(8)
    ]
    log.info("Resolved rdma_all_devices=%s", rdma_all_devices)
    n_rdma = len(rdma_all_devices)

    # 1. Get visible GPU indices
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
    if not cuda_visible_devices:
        warnings.warn("CUDA_VISIBLE_DEVICES is not set. Using default RDMA devices.")
        return ",".join(_pick_default_pair(rdma_all_devices))

    try:
        # Convert to list of integers (handling possible spaces and empty strings)
        gpu_indices = [
            int(idx.strip()) for idx in cuda_visible_devices.split(",") if idx.strip()
        ]
        if not gpu_indices or len(gpu_indices) > 4:
            return ",".join(_pick_default_pair(rdma_all_devices))
    except ValueError:
        warnings.warn(f"Invalid CUDA_VISIBLE_DEVICES format: {cuda_visible_devices}")
        return ",".join(_pick_default_pair(rdma_all_devices))

    # 2. Calculate base RDMA index group (each group of 4 GPUs uses consecutive devices)
    base_rdma_group = (min(gpu_indices) // 4) * 4

    # 3. Map every valid GPU index to its NIC. Using a single loop both
    # validates and maps, so indices flagged as outside the expected group
    # are actually skipped (a split warn-loop/map-loop would still append
    # NICs for the invalid indices). `gpus_per_nic` is clamped to >= 1 to
    # avoid ZeroDivisionError when the pool holds more than 8 NICs, and the
    # NIC index is clamped into range for pool sizes that do not divide 8.
    gpus_per_nic = max(1, 8 // n_rdma)
    rdma_devices = []
    for gpu_idx in gpu_indices:
        if not (base_rdma_group <= gpu_idx < base_rdma_group + 4):
            warnings.warn(
                f"GPU index {gpu_idx} is outside expected group "
                f"{base_rdma_group}-{base_rdma_group+3}"
            )
            continue
        nic_index = min(gpu_idx // gpus_per_nic, n_rdma - 1)
        rdma_devices.append(rdma_all_devices[nic_index])

    if not rdma_devices:
        return ",".join(_pick_default_pair(rdma_all_devices))
    return ",".join(rdma_devices)
......@@ -163,9 +163,7 @@ suites = {
TestFile("lora/test_lora_llama4.py", 400),
TestFile("test_deepseek_v3_basic.py", 275),
TestFile("test_deepseek_v3_mtp.py", 275),
TestFile("test_disaggregation_different_tp.py", 600),
TestFile("test_disaggregation_hybrid_attention.py", 200),
TestFile("test_disaggregation_pp.py", 140),
],
"per-commit-4-gpu-b200": [
# TestFile("test_gpt_oss_4gpu.py", 600),
......@@ -182,6 +180,8 @@ suites = {
TestFile("test_deepseek_v32_basic.py", 275),
],
"per-commit-8-gpu-h20": [
TestFile("test_disaggregation_different_tp.py", 600),
TestFile("test_disaggregation_pp.py", 140),
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
],
"vllm_dependency_test": [
......
......@@ -9,6 +9,7 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST_MLA,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
popen_launch_pd_server,
try_cached_model,
)
......@@ -19,7 +20,7 @@ class TestDisaggregationMooncakePrefillLargerTP(TestDisaggregationBase):
# Temporarily disable JIT DeepGEMM
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA)
# Non blocking start servers
cls.start_prefill()
......@@ -90,7 +91,7 @@ class TestDisaggregationMooncakeDecodeLargerTP(TestDisaggregationBase):
# Temporarily disable JIT DeepGEMM
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST_MLA)
# Non blocking start servers
cls.start_prefill()
......@@ -161,7 +162,7 @@ class TestDisaggregationMooncakeMHAPrefillLargerTP(TestDisaggregationBase):
# Temporarily disable JIT DeepGEMM
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
# Non blocking start servers
cls.start_prefill()
......@@ -232,7 +233,7 @@ class TestDisaggregationMooncakeMHADecodeLargerTP(TestDisaggregationBase):
# Temporarily disable JIT DeepGEMM
envs.SGLANG_ENABLE_JIT_DEEPGEMM.set(False)
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
# Non blocking start servers
cls.start_prefill()
......
......@@ -8,6 +8,7 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
popen_launch_pd_server,
try_cached_model,
)
......@@ -15,7 +16,7 @@ class TestDisaggregationPPAccuracy(TestDisaggregationBase):
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.model = DEFAULT_MODEL_NAME_FOR_TEST
cls.model = try_cached_model(DEFAULT_MODEL_NAME_FOR_TEST)
# Non blocking start servers
cls.start_prefill()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment