Unverified commit 4c03dbaa authored by DiweiSun, committed by GitHub

[CI][XPU] Enable sglang CI on Intel XPU (#9493)


Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com>
Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent baf277a9
name: PR Test (XPU)

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: pr-test-xpu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-and-test:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: intel-bmg
    env:
      HF_HOME: /home/sdp/.cache/huggingface
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image
        run: |
          PR_REPO=${{ github.event.pull_request.head.repo.clone_url }}
          PR_HEAD_REF=${{ github.head_ref }}
          docker build \
            ${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO} \
            ${PR_HEAD_REF:+--build-arg SG_LANG_BRANCH=$PR_HEAD_REF} \
            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .

      - name: Run container
        id: start_container
        run: |
          container_id=$(docker run -dt \
            --group-add 992 \
            --group-add $(getent group video | cut -d: -f3) \
            -v ${HF_HOME}:/root/.cache/huggingface \
            --device /dev/dri \
            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
            xpu_sglang_main:bmg)
          echo "Started container: $container_id"
          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token ${HF_TOKEN}'
          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"

      - name: Run E2E bfloat16 tests
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec -w /home/sdp/sglang/ "$cid" \
            bash -c 'export LD_LIBRARY_PATH=/home/sdp/miniforge3/envs/py3.10/lib:$LD_LIBRARY_PATH && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu'

      - name: Cleanup container
        if: always()
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker rm -f "$cid" || true

  finish:
    if: always()
    needs: [build-and-test]
    runs-on: ubuntu-latest
    steps:
      - name: Check job status
        run: |
          if [ "${{ needs.build-and-test.result }}" != "success" ]; then
            echo "Job failed with result: ${{ needs.build-and-test.result }}"
            exit 1
          fi
          echo "All jobs completed successfully"
          exit 0
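Note on the build step: the `${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO}` expansions inject the PR's fork URL and branch only when those shell variables are non-empty. On `push` and `workflow_dispatch` runs the `pull_request` context is empty, the expansions vanish, and the image builds from the `SG_LANG_REPO`/`SG_LANG_BRANCH` defaults declared in docker/Dockerfile.xpu below.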
# If the device is Battlemage, we need to set UBUNTU_VERSION to 24.10
# Usage: docker build --build-arg UBUNTU_VERSION=24.04 --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .

# Use the Intel Deep Learning Essentials base image with Ubuntu 24.04
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Define build arguments
ARG PYTHON_VERSION=3.10
ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
ARG SG_LANG_BRANCH=main
ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
ARG SG_LANG_KERNEL_BRANCH=main

RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
    chown -R sdp:sdp /home/sdp

# Switch to non-root user 'sdp'
USER sdp

# Set HOME and WORKDIR to the user's home directory
ENV HOME=/home/sdp
WORKDIR /home/sdp

RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh && \
    # Initialize the conda environment and install pip
    . ./miniforge3/bin/activate && \
    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
    conda activate py${PYTHON_VERSION} && \
    conda install pip && \
    # Append environment activation to .bashrc for interactive shells
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc

USER root
RUN apt-get update && apt-get install -y intel-ocloc

# Switch back to user sdp
USER sdp

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
    cd sglang/python && \
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
    cd sgl-kernel-xpu && \
    pip install -v . && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
    pip uninstall -y pytorch-triton-xpu && \
    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install -y libsqlite=3.48.0 && \
    # Add the environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc

# Use bash as the default shell, initialized from .bashrc
SHELL ["bash", "-c"]

# Start an interactive bash shell with the full environment set up
USER sdp
CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
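A quick way to sanity-check an image built from this Dockerfile before handing it to CI is to confirm the XPU wheels actually see the GPU. A minimal sketch, assuming the container is started with `--device /dev/dri` and the py3.10 environment active (this script is illustrative, not part of the commit):

# xpu_smoke_test.py -- illustrative smoke test for the built image
import torch

print("torch:", torch.__version__)
print("xpu available:", torch.xpu.is_available())
if torch.xpu.is_available():
    # On the intel-bmg runner this should report the Battlemage GPU.
    print("device:", torch.xpu.get_device_name(0))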
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    is_xpu,
 )
 
 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()
 
 if _is_cuda:
     from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
         cache = cache.to(dtype)
 
         if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu and _is_cpu_amx_available)
+            and not _is_xpu
+        ):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
         s += f", base={self.base}, is_neox_style={self.is_neox_style}"
         return s
 
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # TODO: make a wrapper, and XPU will implement this kernel later.
+        return self.forward_native(positions, query, key, offsets)
+
 
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with linear scaling.
...
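For readers unfamiliar with the hook: `CustomOp` subclasses expose per-backend `forward_*` methods and dispatch to the one matching the active platform, so the new `forward_xpu` simply reuses the native PyTorch path until sgl-kernel-xpu ships a dedicated rotary kernel. A minimal sketch of that dispatch pattern (the class below is hypothetical, not SGLang's actual `CustomOp`):

# dispatch_sketch.py -- hypothetical stand-in for per-backend dispatch
import torch


class DeviceDispatchOp(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Route to the backend-specific implementation for the tensor's
        # device; anything without a dedicated kernel falls back to the
        # pure-PyTorch native path.
        if x.device.type == "cuda":
            return self.forward_cuda(x)
        if x.device.type == "xpu":
            return self.forward_xpu(x)
        return self.forward_native(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2  # placeholder computation

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        # Mirrors the diff above: no XPU kernel yet, so delegate.
        return self.forward_native(x)


print(DeviceDispatchOp()(torch.ones(2)))  # tensor([2., 2.])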
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
+# INT4 models
+DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+)
+
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
...
@@ -316,6 +316,13 @@ suite_xeon = {
     ],
 }
 
+# Add Intel XPU tests
+suite_xpu = {
+    "per-commit-xpu": [
+        TestFile("xpu/test_intel_xpu_backend.py"),
+    ],
+}
+
 # Add Ascend NPU tests
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_ascend = {
@@ -341,6 +348,7 @@ suite_ascend = {
 suites.update(suite_amd)
 suites.update(suite_xeon)
 suites.update(suite_ascend)
+suites.update(suite_xpu)
 
 
 def auto_partition(files, rank, size):
...
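The workflow's `python3 run_suite.py --suite per-commit-xpu` resolves the suite name against the merged `suites` dict and runs each listed file. A minimal sketch of that lookup, hypothetical rather than the real run_suite.py:

# suite_lookup_sketch.py -- hypothetical, not the real run_suite.py
import argparse
import subprocess
import sys
from dataclasses import dataclass


@dataclass
class TestFile:
    name: str


# Mirrors the registry pattern above: suite name -> list of test files.
suites = {"per-commit-xpu": [TestFile("xpu/test_intel_xpu_backend.py")]}

parser = argparse.ArgumentParser()
parser.add_argument("--suite", choices=sorted(suites), required=True)
args = parser.parse_args()

for test in suites[args.suite]:
    # Run each file in its own interpreter and stop on the first failure.
    ret = subprocess.run([sys.executable, test.name]).returncode
    if ret != 0:
        sys.exit(ret)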
"""
Usage:
python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
"""
import os
import unittest
from functools import wraps
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
CustomTestCase,
is_in_ci,
run_bench_one_batch,
)
def intel_xpu_benchmark(extra_args=None, min_throughput=None):
def decorator(test_func):
@wraps(test_func)
def wrapper(self):
common_args = [
"--disable-radix",
"--trust-remote-code",
"--mem-fraction-static",
"0.3",
"--batch-size",
"1",
"--device",
"xpu",
]
full_args = common_args + (extra_args or [])
model = test_func(self)
prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
model, full_args
)
print(f"{model=}")
print(f"{prefill_latency=}")
print(f"{decode_throughput=}")
print(f"{decode_latency=}")
if is_in_ci() and min_throughput is not None:
self.assertGreater(decode_throughput, min_throughput)
return wrapper
return decorator
class TestIntelXPUBackend(CustomTestCase):
@intel_xpu_benchmark(min_throughput=10)
def test_latency_qwen_model(self):
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
if __name__ == "__main__":
unittest.main()
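The decorator makes further XPU benchmarks one-liners: a decorated test just returns a model name, `extra_args` extends the common flag set, and the throughput assertion fires only in CI. A hypothetical extension (the flags and threshold are assumptions, not part of this commit):

# Hypothetical follow-up test, not in this PR. Assumes:
# from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4
class TestIntelXPUBackendAWQ(CustomTestCase):
    @intel_xpu_benchmark(
        extra_args=["--quantization", "awq"],  # assumed flags, for illustration
        min_throughput=10,  # assumed threshold
    )
    def test_latency_awq_model(self):
        # Reuses the AWQ-INT4 constant added to test_utils in this PR.
        return DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4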