[feature] enable NPU CI (#7935)

Co-authored-by: Even Zhou <14368888+iforgetmyname@users.noreply.github.com>

[feature] enable NPU CI (#7935)
Co-authored-by: Even Zhou <14368888+iforgetmyname@users.noreply.github.com>
93d124ef · ronnie_zheng · GitHub · 1fc455e8 · 93d124ef · 93d124ef
Unverified Commit 93d124ef authored Jul 20, 2025 by ronnie_zheng Committed by GitHub Jul 20, 2025
4 changed files
--- a/.github/workflows/pr-test-npu.yml
+++ b/.github/workflows/pr-test-npu.yml
+name: PR Test (Ascend NPU)
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "python/**"
+      - "scripts/**"
+      - "test/**"
+      - ".github/workflows/pr-test-npu.yml"
+  workflow_dispatch:
+concurrency:
+  group: pr-test-npu-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  unit-test-basic:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-arm64-npu-1
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          bash scripts/npu_ci_install_dependency.sh
+          # copy required dataset file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+      - name: Run test
+        timeout-minutes: 30
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          HF_ENDPOINT: https://hf-mirror.com
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-npu
+  finish:
+    if: always()
+    needs: [ unit-test-basic ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
    hooks:
      - id: codespell
        additional_dependencies: ['tomli']
-        args: ['--toml', 'python/pyproject.toml']
+        args: ['--toml', 'python/pyproject.toml', '-L', 'cann']
        exclude: test/srt/test_reasoning_parser.py # Exclude the test file that is expected to fail
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v18.1.8

--- a/scripts/npu_ci_install_dependency.sh
+++ b/scripts/npu_ci_install_dependency.sh
+#!/bin/bash
+set -euo pipefail
+# Install the required dependencies in CI.
+sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+apt update -y
+apt install -y build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common
+pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple
+python3 -m pip install --upgrade pip
+pip uninstall sgl-kernel -y || true
+### Download MemFabricV2
+MF_WHL_NAME="mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
+MEMFABRIC_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com:443/sglang/${MF_WHL_NAME}"
+wget "${MEMFABRIC_URL}" && pip install "./${MF_WHL_NAME}"
+### Install vLLM
+VLLM_TAG=v0.8.5
+git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_TAG
+(cd vllm && VLLM_TARGET_DEVICE="empty" pip install -v -e .)
+### Install PyTorch and PTA
+PYTORCH_VERSION=2.6.0
+TORCHVISION_VERSION=0.21.0
+PTA_VERSION=2.6.0rc1
+pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
+pip install torch_npu==$PTA_VERSION
+### Install Triton-Ascend
+TRITON_ASCEND_VERSION=3.2.0rc2
+pip install attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11
+pip install triton-ascend==$TRITON_ASCEND_VERSION
+pip install -e "python[srt_npu]"
+### Modify PyTorch TODO: to be removed later
+TORCH_LOCATION=$(python3 -c 'import torch; print(torch.__path__[0])')
+sed -i 's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/' "${TORCH_LOCATION}/_inductor/runtime/triton_heuristics.py"
--- a/test/srt/test_ascend_attention_backend.py
+++ b/test/srt/test_ascend_attention_backend.py
@@ -20,22 +20,10 @@ from sglang.test.test_utils import (
    run_bench_offline_throughput,
 )
+DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"
-class TestAscendAttnBackend(CustomTestCase):
-    def test_latency(self):
-        output_throughput = run_bench_offline_throughput(
-            DEFAULT_MODEL_NAME_FOR_TEST,
-            [
-                "--attention-backend",
-                "ascend",
-            ],
-        )
-        print(f"{output_throughput=}")
-        if is_in_ci():
-            self.assertGreater(output_throughput, 18)
+class TestAscendAttnBackend(CustomTestCase):
    def test_gsm8k(self):
        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST