Unverified commit 4c03dbaa authored by DiweiSun, committed by GitHub

[CI][XPU] Enable sglang CI on Intel XPU (#9493)


Co-authored-by: huaiyuzh <huaiyu.zheng@intel.com>
Co-authored-by: Ma Mingfei <mingfei.ma@intel.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent baf277a9
name: PR Test (XPU)

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-xpu.yml"
    types: [synchronize, labeled]
  workflow_dispatch:

concurrency:
  group: pr-test-xpu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  build-and-test:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: intel-bmg
    env:
      HF_HOME: /home/sdp/.cache/huggingface
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image
        run: |
          PR_REPO=${{ github.event.pull_request.head.repo.clone_url }}
          PR_HEAD_REF=${{ github.head_ref }}
          docker build \
            ${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO} \
            ${PR_HEAD_REF:+--build-arg SG_LANG_BRANCH=$PR_HEAD_REF} \
            --no-cache --progress=plain -f docker/Dockerfile.xpu -t xpu_sglang_main:bmg .

      - name: Run container
        id: start_container
        run: |
          container_id=$(docker run -dt \
            --group-add 992 \
            --group-add $(getent group video | cut -d: -f3) \
            -v ${HF_HOME}:/root/.cache/huggingface \
            --device /dev/dri \
            -e HF_TOKEN="$(cat ~/huggingface_token.txt)" \
            xpu_sglang_main:bmg)
          echo "Started container: $container_id"
          echo "container_id=$container_id" >> "$GITHUB_OUTPUT"

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install --upgrade pip
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip install pytest expecttest ray huggingface_hub
          docker exec "$cid" /home/sdp/miniforge3/envs/py3.10/bin/python3 -m pip uninstall -y flashinfer-python
          docker exec "$cid" /bin/bash -c '/home/sdp/miniforge3/envs/py3.10/bin/huggingface-cli login --token ${HF_TOKEN}'
          docker exec -u root "$cid" /bin/bash -c "ln -sf /home/sdp/miniforge3/envs/py3.10/bin/python3 /usr/bin/python3"

      - name: Run E2E bfloat16 tests
        timeout-minutes: 20
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker exec -w /home/sdp/sglang/ "$cid" \
            bash -c 'export LD_LIBRARY_PATH=/home/sdp/miniforge3/envs/py3.10/lib:$LD_LIBRARY_PATH && cd ./test/srt && python3 run_suite.py --suite per-commit-xpu'

      - name: Cleanup container
        if: always()
        run: |
          cid="${{ steps.start_container.outputs.container_id }}"
          docker rm -f "$cid" || true

  finish:
    if: always()
    needs: [build-and-test]
    runs-on: ubuntu-latest
    steps:
      - name: Check job status
        run: |
          if [ "${{ needs.build-and-test.result }}" != "success" ]; then
            echo "Job failed with result: ${{ needs.build-and-test.result }}"
            exit 1
          fi
          echo "All jobs completed successfully"
          exit 0
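Note on the build step: the `${PR_REPO:+--build-arg SG_LANG_REPO=$PR_REPO}` expansions inject the PR's fork URL and branch only when those shell variables are non-empty. On `push` and `workflow_dispatch` runs the `pull_request` context is empty, the expansions vanish, and the image builds from the `SG_LANG_REPO`/`SG_LANG_BRANCH` defaults declared in docker/Dockerfile.xpu below.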
# If the device is Battlemage, we need to set UBUNTU_VERSION to 24.10
# Usage: docker build --build-arg UBUNTU_VERSION=24.04 --build-arg PYTHON_VERSION=3.10 -t sglang:xpu_kernel -f Dockerfile.xpu --no-cache .

# Use the Intel Deep Learning Essentials base image with Ubuntu 24.04
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04

# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive

# Define build arguments
ARG PYTHON_VERSION=3.10
ARG SG_LANG_REPO=https://github.com/sgl-project/sglang.git
ARG SG_LANG_BRANCH=main
ARG SG_LANG_KERNEL_REPO=https://github.com/sgl-project/sgl-kernel-xpu.git
ARG SG_LANG_KERNEL_BRANCH=main

RUN useradd -m -d /home/sdp -s /bin/bash sdp && \
    chown -R sdp:sdp /home/sdp

# Switch to non-root user 'sdp'
USER sdp

# Set HOME and WORKDIR to the user's home directory
ENV HOME=/home/sdp
WORKDIR /home/sdp

RUN curl -fsSL -o miniforge.sh https://github.com/conda-forge/miniforge/releases/download/25.1.1-0/Miniforge3-Linux-x86_64.sh && \
    bash miniforge.sh -b -p ./miniforge3 && \
    rm miniforge.sh && \
    # Initialize the conda environment and install pip
    . ./miniforge3/bin/activate && \
    conda create -y -n py${PYTHON_VERSION} python=${PYTHON_VERSION} && \
    conda activate py${PYTHON_VERSION} && \
    conda install pip && \
    # Append environment activation to .bashrc for interactive shells
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; . /opt/intel/oneapi/setvars.sh; cd /home/sdp" >> /home/sdp/.bashrc

USER root
RUN apt-get update && apt-get install -y intel-ocloc

# Switch back to user sdp
USER sdp

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu

RUN --mount=type=secret,id=github_token \
    cd /home/sdp && \
    . /home/sdp/miniforge3/bin/activate && \
    conda activate py${PYTHON_VERSION} && \
    echo "Cloning ${SG_LANG_BRANCH} from ${SG_LANG_REPO}" && \
    git clone --branch ${SG_LANG_BRANCH} --single-branch ${SG_LANG_REPO} && \
    cd sglang/python && \
    cp pyproject_xpu.toml pyproject.toml && \
    pip install . && \
    echo "Cloning ${SG_LANG_KERNEL_BRANCH} from ${SG_LANG_KERNEL_REPO}" && \
    git clone --branch ${SG_LANG_KERNEL_BRANCH} --single-branch ${SG_LANG_KERNEL_REPO} && \
    cd sgl-kernel-xpu && \
    pip install -v . && \
    pip install msgspec blake3 py-cpuinfo compressed_tensors gguf partial_json_parser einops --root-user-action=ignore && \
    pip uninstall -y pytorch-triton-xpu && \
    pip install --pre pytorch-triton-xpu --index-url https://download.pytorch.org/whl/xpu && \
    conda install -y libsqlite=3.48.0 && \
    # Add the environment setup commands to .bashrc again (in case it was overwritten)
    echo ". /home/sdp/miniforge3/bin/activate; conda activate py${PYTHON_VERSION}; cd /home/sdp" >> /home/sdp/.bashrc

# Use bash as the default shell, initialized from .bashrc
SHELL ["bash", "-c"]

# Start an interactive bash shell with the full environment set up
USER sdp
CMD ["bash", "-c", "source /home/sdp/.bashrc && exec bash"]
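A quick way to sanity-check an image built from this Dockerfile before handing it to CI is to confirm the XPU wheels actually see the GPU. A minimal sketch, assuming the container is started with `--device /dev/dri` and the py3.10 environment active (this script is illustrative, not part of the commit):

# xpu_smoke_test.py -- illustrative smoke test for the built image
import torch

print("torch:", torch.__version__)
print("xpu available:", torch.xpu.is_available())
if torch.xpu.is_available():
    # On the intel-bmg runner this should report the Battlemage GPU.
    print("device:", torch.xpu.get_device_name(0))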
@@ -17,6 +17,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    is_xpu,
 )
 
 _is_cuda = is_cuda()
@@ -25,6 +26,7 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 _is_npu = is_npu()
 _is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_xpu = is_xpu()
 
 if _is_cuda:
     from sgl_kernel import FusedSetKVBufferArg, apply_rope_with_cos_sin_cache_inplace
@@ -109,8 +111,10 @@ class RotaryEmbedding(CustomOp):
         cache = cache.to(dtype)
 
         if (
-            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
-        ) and not (_is_cpu and _is_cpu_amx_available):
+            (not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu and _is_cpu_amx_available)
+            and not _is_xpu
+        ):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -284,6 +288,16 @@ class RotaryEmbedding(CustomOp):
         s += f", base={self.base}, is_neox_style={self.is_neox_style}"
         return s
 
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # TODO: make a wrapper, and XPU will implement this kernel later.
+        return self.forward_native(positions, query, key, offsets)
+
 
 class LinearScalingRotaryEmbedding(RotaryEmbedding):
     """RotaryEmbedding extended with linear scaling.
...
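For readers unfamiliar with the hook: `CustomOp` subclasses expose per-backend `forward_*` methods and dispatch to the one matching the active platform, so the new `forward_xpu` simply reuses the native PyTorch path until sgl-kernel-xpu ships a dedicated rotary kernel. A minimal sketch of that dispatch pattern (the class below is hypothetical, not SGLang's actual `CustomOp`):

# dispatch_sketch.py -- hypothetical stand-in for per-backend dispatch
import torch


class DeviceDispatchOp(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Route to the backend-specific implementation for the tensor's
        # device; anything without a dedicated kernel falls back to the
        # pure-PyTorch native path.
        if x.device.type == "cuda":
            return self.forward_cuda(x)
        if x.device.type == "xpu":
            return self.forward_xpu(x)
        return self.forward_native(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2  # placeholder computation

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return self.forward_native(x)

    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
        # Mirrors the diff above: no XPU kernel yet, so delegate.
        return self.forward_native(x)


print(DeviceDispatchOp()(torch.ones(2)))  # tensor([2., 2.])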
@@ -75,6 +75,11 @@ DEFAULT_MODEL_NAME_FOR_TEST_FP8_WITH_MOE = "gaunernst/DeepSeek-V2-Lite-Chat-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8 = "RedHatAI/Llama-3.2-3B-quantized.w8a8"
 DEFAULT_MODEL_NAME_FOR_TEST_W8A8_WITH_MOE = "nytopop/Qwen3-30B-A3B.w8a8"
 
+# INT4 models
+DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4 = (
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
+)
+
 # EAGLE
 DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
 DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
...
@@ -316,6 +316,13 @@ suite_xeon = {
     ],
 }
 
+# Add Intel XPU tests
+suite_xpu = {
+    "per-commit-xpu": [
+        TestFile("xpu/test_intel_xpu_backend.py"),
+    ],
+}
+
 # Add Ascend NPU tests
 # NOTE: please sort the test cases alphabetically by the test file name
 suite_ascend = {
@@ -341,6 +348,7 @@ suite_ascend = {
 suites.update(suite_amd)
 suites.update(suite_xeon)
 suites.update(suite_ascend)
+suites.update(suite_xpu)
 
 
 def auto_partition(files, rank, size):
...
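The workflow's `python3 run_suite.py --suite per-commit-xpu` resolves the suite name against the merged `suites` dict and runs each listed file. A minimal sketch of that lookup, hypothetical rather than the real run_suite.py:

# suite_lookup_sketch.py -- hypothetical, not the real run_suite.py
import argparse
import subprocess
import sys
from dataclasses import dataclass


@dataclass
class TestFile:
    name: str


# Mirrors the registry pattern above: suite name -> list of test files.
suites = {"per-commit-xpu": [TestFile("xpu/test_intel_xpu_backend.py")]}

parser = argparse.ArgumentParser()
parser.add_argument("--suite", choices=sorted(suites), required=True)
args = parser.parse_args()

for test in suites[args.suite]:
    # Run each file in its own interpreter and stop on the first failure.
    ret = subprocess.run([sys.executable, test.name]).returncode
    if ret != 0:
        sys.exit(ret)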
"""
Usage:
python3 -m unittest test_intel_xpu_backend.TestIntelXPUBackend.test_latency_qwen_model
"""
import os
import unittest
from functools import wraps
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN,
CustomTestCase,
is_in_ci,
run_bench_one_batch,
)
def intel_xpu_benchmark(extra_args=None, min_throughput=None):
def decorator(test_func):
@wraps(test_func)
def wrapper(self):
common_args = [
"--disable-radix",
"--trust-remote-code",
"--mem-fraction-static",
"0.3",
"--batch-size",
"1",
"--device",
"xpu",
]
full_args = common_args + (extra_args or [])
model = test_func(self)
prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
model, full_args
)
print(f"{model=}")
print(f"{prefill_latency=}")
print(f"{decode_throughput=}")
print(f"{decode_latency=}")
if is_in_ci() and min_throughput is not None:
self.assertGreater(decode_throughput, min_throughput)
return wrapper
return decorator
class TestIntelXPUBackend(CustomTestCase):
@intel_xpu_benchmark(min_throughput=10)
def test_latency_qwen_model(self):
return DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN
if __name__ == "__main__":
unittest.main()
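The decorator makes further XPU benchmarks one-liners: a decorated test just returns a model name, `extra_args` extends the common flag set, and the throughput assertion fires only in CI. A hypothetical extension (the flags and threshold are assumptions, not part of this commit):

# Hypothetical follow-up test, not in this PR. Assumes:
# from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4
class TestIntelXPUBackendAWQ(CustomTestCase):
    @intel_xpu_benchmark(
        extra_args=["--quantization", "awq"],  # assumed flags, for illustration
        min_throughput=10,  # assumed threshold
    )
    def test_latency_awq_model(self):
        # Reuses the AWQ-INT4 constant added to test_utils in this PR.
        return DEFAULT_MODEL_NAME_FOR_TEST_AWQ_INT4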