[CI] Build aarch64 kernels for sgl-kernel test (#12480)

566ade03 · Baizhou Zhang · GitHub · 69193f71 · 566ade03 · 566ade03
Unverified commit 566ade03, authored Nov 01, 2025 by Baizhou Zhang and committed by GitHub on Nov 01, 2025
Showing 3 changed files with 49 additions and 6 deletions

.github/workflows/pr-test.yml +40 -4

scripts/ci/ci_install_dependency.sh +7 -2

sgl-kernel/README.md +2 -0

No files found.
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -68,8 +68,9 @@ jobs:
        include:
          - python-version: "3.10"
            cuda-version: "12.9"
-          - python-version: "3.10"
-            cuda-version: "13.0"
+          # Add back when CUDA 13.0 is supported on CI
+          # - python-version: "3.10"
+          #   cuda-version: "13.0"
    name: Build Wheel
    steps:
      - name: Cleanup
@@ -96,6 +97,41 @@ jobs:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*
+  sgl-kernel-build-wheels-arm:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.sgl_kernel == 'true'
+    runs-on: arm-kernel-build-node
+    strategy:
+      matrix:
+        include:
+          - python-version: "3.10"
+            cuda-version: "12.9"
+    name: Build Wheel Arm
+    steps:
+      - name: Cleanup
+        run: |
+          sudo rm -rf $GITHUB_WORKSPACE/* || true
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
+        run: |
+          cd sgl-kernel
+          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
+          path: sgl-kernel/dist/*
  sgl-kernel-unit-test:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: needs.check-changes.outputs.sgl_kernel == 'true'
@@ -785,7 +821,7 @@ jobs:
          python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
  unit-test-backend-4-gpu-gb200:
-    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
+    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels-arm]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 4-gpu-gb200
@@ -801,7 +837,7 @@ jobs:
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
+          pattern: wheel-python3.10-cuda12.9-aarch64
      - name: Install dependencies
        run: |

--- a/scripts/ci/ci_install_dependency.sh
+++ b/scripts/ci/ci_install_dependency.sh
@@ -105,8 +105,13 @@ echo "SGL_KERNEL_VERSION_FROM_KERNEL=${SGL_KERNEL_VERSION_FROM_KERNEL} SGL_KERNE
 if [ "${CUSTOM_BUILD_SGL_KERNEL:-}" = "true" ]; then
    ls -alh sgl-kernel/dist
-    # TODO: Currently we don't support custom build sgl-kernel for aarch64. To be changed after kernel build for aarch64 is added.
-    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
+    # Determine wheel architecture
+    if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
+        WHEEL_ARCH="aarch64"
+    else
+        WHEEL_ARCH="x86_64"
+    fi
+    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_${WHEEL_ARCH}.whl --force-reinstall $PIP_INSTALL_SUFFIX
 else
    $PIP_CMD install sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT} --force-reinstall $PIP_INSTALL_SUFFIX
 fi

--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -105,3 +105,5 @@ m.impl("fwd", torch::kCUDA, make_pytorch_shim(&mha_fwd));
 ## FAQ
 - Q: Segmentation fault with CUDA 12.6
 - A: Update ptxas to 12.8, reference: [segment fault error](https://github.com/Dao-AILab/flash-attention/issues/1453)
+Trigger arm build test (Should be removed later)