[NVIDIA] Add CI workloads for GB200 (#12242)

c0d02cf4 · Kaixi Hou · GitHub · 7d121448 · c0d02cf4 · c0d02cf4
Unverified commit c0d02cf4, authored Oct 30, 2025 by Kaixi Hou; committed by GitHub on Oct 30, 2025
5 changed files
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -784,6 +784,35 @@ jobs:
          cd test/srt
          python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
+  unit-test-backend-4-gpu-gb200:
+    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
+    if: always() && !failure() && !cancelled() &&
+      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+    runs-on: 4-gpu-gb200
+    strategy:
+      fail-fast: false
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+      - name: Install dependencies
+        run: |
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
+      - name: Run test
+        timeout-minutes: 45
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-gpu-gb200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
  pr-test-finish:
    needs:
      [
@@ -808,6 +837,7 @@ jobs:
        unit-test-deepep-4-gpu,
        unit-test-deepep-8-gpu,
        unit-test-backend-4-gpu-b200,
+        unit-test-backend-4-gpu-gb200,
      ]
    if: always()
    runs-on: ubuntu-latest

--- a/scripts/ci/ci_install_dependency.sh
+++ b/scripts/ci/ci_install_dependency.sh
@@ -5,6 +5,10 @@ set -euxo pipefail
 IS_BLACKWELL=${IS_BLACKWELL:-0}
 CU_VERSION="cu129"
+# Detect system architecture
+ARCH=$(uname -m)
+echo "Detected architecture: ${ARCH}"
 if [ "$CU_VERSION" = "cu130" ]; then
    NVRTC_SPEC="nvidia-cuda-nvrtc"
 else
@@ -23,8 +27,20 @@ rm -rf /root/.cache/flashinfer
 # Install apt packages
 apt install -y git libnuma-dev libssl-dev pkg-config
+# Check if protoc of correct architecture is already installed
+if command -v protoc >/dev/null 2>&1; then
+    if protoc --version >/dev/null 2>&1; then
+        echo "protoc already installed: $(protoc --version)"
+    else
+        echo "protoc found but not runnable, reinstalling..."
+        INSTALL_PROTOC=1
+    fi
+else
+    INSTALL_PROTOC=1
+fi
 # Install protoc for router build (gRPC protobuf compilation)
-if ! command -v protoc &> /dev/null; then
+if [ "${INSTALL_PROTOC:-0}" = "1" ]; then
    echo "Installing protoc..."
    if command -v apt-get &> /dev/null; then
        # Ubuntu/Debian
@@ -37,9 +53,16 @@ if ! command -v protoc &> /dev/null; then
    fi
    cd /tmp
-    wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/protoc-32.0-linux-x86_64.zip
+    # Determine protoc architecture
-    unzip protoc-32.0-linux-x86_64.zip -d /usr/local
+    if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
-    rm protoc-32.0-linux-x86_64.zip
+        PROTOC_ARCH="aarch_64"
+    else
+        PROTOC_ARCH="x86_64"
+    fi
+    PROTOC_ZIP="protoc-32.0-linux-${PROTOC_ARCH}.zip"
+    wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/${PROTOC_ZIP}
+    unzip -o ${PROTOC_ZIP} -d /usr/local
+    rm ${PROTOC_ZIP}
    protoc --version
    cd -
 else
@@ -52,6 +75,7 @@ if [ "$IS_BLACKWELL" = "1" ]; then
    # so we can only use pip with `--break-system-packages`
    PIP_CMD="pip"
    PIP_INSTALL_SUFFIX="--break-system-packages"
+    $PIP_CMD install --upgrade pip
    # Clean up existing installations
    $PIP_CMD uninstall -y sgl-kernel sglang $PIP_INSTALL_SUFFIX || true
@@ -81,7 +105,13 @@ echo "SGL_KERNEL_VERSION_FROM_KERNEL=${SGL_KERNEL_VERSION_FROM_KERNEL} SGL_KERNE
 if [ "${CUSTOM_BUILD_SGL_KERNEL:-}" = "true" ]; then
    ls -alh sgl-kernel/dist
-    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
+    # Determine wheel architecture
+    if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
+        WHEEL_ARCH="aarch64"
+    else
+        WHEEL_ARCH="x86_64"
+    fi
+    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_${WHEEL_ARCH}.whl --force-reinstall $PIP_INSTALL_SUFFIX
 else
    $PIP_CMD install sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT} --force-reinstall $PIP_INSTALL_SUFFIX
 fi

--- a/scripts/ci_monitor/ci_analyzer.py
+++ b/scripts/ci_monitor/ci_analyzer.py
@@ -112,6 +112,9 @@ class SGLangCIAnalyzer:
            "b200": [
                "unit-test-backend-4-gpu-b200",
            ],
+            "gb200": [
+                "unit-test-backend-4-gpu-gb200",
+            ],
        }
        stats = {
@@ -180,6 +183,7 @@ class SGLangCIAnalyzer:
                    "unit-test-deepep-8-gpu",
                    "unit-test-backend-8-gpu-deepseek-v32",
                    "unit-test-backend-4-gpu-b200",
+                    "unit-test-backend-4-gpu-gb200",
                    "vllm-dependency-test",
                    "nightly-test-eval-text-models",
                    "nightly-test-perf-text-models",

--- a/scripts/ci_monitor/ci_analyzer_balance.py
+++ b/scripts/ci_monitor/ci_analyzer_balance.py
@@ -181,6 +181,7 @@ class SGLangTestBalanceAnalyzer:
            "unit-test-backend-8-gpu-h200",
            "unit-test-backend-8-gpu-h20",
            "unit-test-backend-4-gpu-b200",
+            "unit-test-backend-4-gpu-gb200",
            "unit-test-deepep-4-gpu",
            "unit-test-deepep-8-gpu",
            "unit-test-backend-8-gpu-deepseek-v32",

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -185,6 +185,9 @@ suites = {
        TestFile("test_gpt_oss_4gpu.py", 600),
        TestFile("test_llama31_fp4.py", 300),
    ],
+    "per-commit-4-gpu-gb200": [
+        TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
+    ],
    "per-commit-4-gpu-deepep": [
        TestFile("ep/test_deepep_small.py", 531),
        TestFile("ep/test_mooncake_ep_small.py", 450),