Unverified Commit cb9574eb authored by Kunshang Ji's avatar Kunshang Ji Committed by GitHub
Browse files

[XPU][9/N] clean up existing ipex code/doc (#34111)


Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
parent 21dfb842
...@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace ...@@ -134,7 +134,6 @@ WORKDIR /vllm-workspace
# Copy test requirements # Copy test requirements
COPY requirements/test.in requirements/cpu-test.in COPY requirements/test.in requirements/cpu-test.in
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN \ RUN \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
remove_packages_not_supported_on_aarch64() { \ remove_packages_not_supported_on_aarch64() { \
......
...@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform. ...@@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform.
# --8<-- [start:requirements] # --8<-- [start:requirements]
- Supported Hardware: Intel Data Center GPU, Intel ARC GPU - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.1 - OneAPI requirements: oneAPI 2025.3
- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package that provides all the necessary vLLM custom kernels when running vLLM on Intel GPU platforms.
- Python: 3.12 - Python: 3.12
!!! warning !!! warning
The provided IPEX whl is Python3.12 specific so this version is a MUST. The provided vllm-xpu-kernels whl is Python3.12 specific so this version is a MUST.
# --8<-- [end:requirements] # --8<-- [end:requirements]
# --8<-- [start:set-up-using-python] # --8<-- [start:set-up-using-python]
...@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels. ...@@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels.
# --8<-- [end:pre-built-wheels] # --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source] # --8<-- [start:build-wheel-from-source]
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later. - First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later.
- Second, install Python packages for vLLM XPU backend building: - Second, install Python packages for vLLM XPU backend building:
```bash ```bash
...@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt ...@@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt
- Then, build and install vLLM XPU backend: - Then, build and install vLLM XPU backend:
```bash ```bash
VLLM_TARGET_DEVICE=xpu python setup.py install VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v
``` ```
# --8<-- [end:build-wheel-from-source] # --8<-- [end:build-wheel-from-source]
......
...@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"] ...@@ -17,7 +17,7 @@ DTYPE = ["bfloat16"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE) @pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype): def test_cpu_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm: with vllm_runner(model, dtype=dtype) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32) output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output assert output
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
Validates the configuration and prints the results for manual checking.
Run `pytest tests/quantization/test_ipex_quant.py`.
"""
import pytest
from vllm.platforms import current_platform
# Quantized Hugging Face checkpoints used to parametrize the test below
# (one AWQ checkpoint and one GPTQ checkpoint).
MODELS = [
    "AMead10/Llama-3.2-1B-Instruct-AWQ",
    "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx
]
# Dtype strings used to parametrize the test below.
DTYPE = ["bfloat16"]
@pytest.mark.skipif(
    not (current_platform.is_cpu() or current_platform.is_xpu()),
    reason="only supports Intel CPU/XPU backend.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
    """Smoke-test greedy generation for one quantized model/dtype pair.

    The test is skipped unless the current platform reports CPU or XPU.
    It loads ``model`` through the ``vllm_runner`` fixture in eager mode,
    generates up to 4 tokens for a single prompt, asserts that the result
    is truthy, and prints it for manual inspection.
    """
    prompts = ["The capital of France is"]
    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
        output = llm.generate_greedy(prompts, max_tokens=4)
    # NOTE(review): the scraped source lost its indentation; the check and
    # print are assumed to run after the runner context exits - confirm.
    assert output
    print(output)
...@@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"): ...@@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"):
return torch.empty((M, N), dtype=input.dtype, device=input.device) return torch.empty((M, N), dtype=input.dtype, device=input.device)
class ipex_ops: class xpu_ops:
@staticmethod @staticmethod
def flash_attn_varlen_func( def flash_attn_varlen_func(
q: torch.Tensor, q: torch.Tensor,
...@@ -73,7 +73,7 @@ class ipex_ops: ...@@ -73,7 +73,7 @@ class ipex_ops:
cu_seqlens_k: torch.Tensor | None = None, cu_seqlens_k: torch.Tensor | None = None,
# passed in qwen vl # passed in qwen vl
dropout_p: float = 0.0, dropout_p: float = 0.0,
# The following parameters are not used in ipex kernel currently, # The following parameters are not used in xpu kernel currently,
# we keep API compatible to CUDA's. # we keep API compatible to CUDA's.
scheduler_metadata=None, scheduler_metadata=None,
fa_version: int = 2, fa_version: int = 2,
...@@ -153,6 +153,6 @@ class ipex_ops: ...@@ -153,6 +153,6 @@ class ipex_ops:
sm_margin=0, # Can be tuned if some SMs are used for communication sm_margin=0, # Can be tuned if some SMs are used for communication
) -> None: ) -> None:
logger.warning_once( logger.warning_once(
"get_scheduler_metadata is not implemented for ipex_ops, returning None." "get_scheduler_metadata is not implemented for xpu_ops, returning None."
) )
return None return None
...@@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: ...@@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
logger.info_once("Using Triton backend") logger.info_once("Using Triton backend")
return Mxfp4Backend.TRITON return Mxfp4Backend.TRITON
elif current_platform.is_xpu(): elif current_platform.is_xpu():
logger.info_once("Using ipex marlin backend on XPU") logger.info_once("Using xpu backend on XPU")
return Mxfp4Backend.MARLIN return Mxfp4Backend.MARLIN
elif current_platform.is_rocm() and has_triton_kernels(): elif current_platform.is_rocm() and has_triton_kernels():
logger.info_once("Using Triton backend") logger.info_once("Using Triton backend")
......
...@@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager ...@@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
elif current_platform.is_xpu(): elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops as ops from vllm._xpu_ops import xpu_ops as ops
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -345,7 +345,6 @@ class CpuPlatform(Platform): ...@@ -345,7 +345,6 @@ class CpuPlatform(Platform):
ld_preload_str += pytorch_libgomp_so ld_preload_str += pytorch_libgomp_so
os.environ["LD_PRELOAD"] = ld_preload_str os.environ["LD_PRELOAD"] = ld_preload_str
# To hint IPEX uses shared memory based AllReduce
os.environ["LOCAL_WORLD_SIZE"] = str( os.environ["LOCAL_WORLD_SIZE"] = str(
vllm_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.tensor_parallel_size
) )
......
...@@ -23,12 +23,11 @@ if current_platform.is_cuda(): ...@@ -23,12 +23,11 @@ if current_platform.is_cuda():
elif current_platform.is_xpu(): elif current_platform.is_xpu():
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm._xpu_ops import xpu_ops
reshape_and_cache_flash = ops.reshape_and_cache_flash reshape_and_cache_flash = ops.reshape_and_cache_flash
from vllm._ipex_ops import ipex_ops flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment]
get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment]
flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment]
get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment]
elif current_platform.is_rocm(): elif current_platform.is_rocm():
try: try:
from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] from flash_attn import flash_attn_varlen_func # type: ignore[no-redef]
...@@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool: ...@@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool:
Platform-specific sources: Platform-specific sources:
- CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func - CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func
- XPU: ipex_ops.flash_attn_varlen_func - XPU: xpu_ops.flash_attn_varlen_func
- ROCm: upstream flash_attn.flash_attn_varlen_func (if available) - ROCm: upstream flash_attn.flash_attn_varlen_func (if available)
Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py) Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py)
......
...@@ -9,7 +9,7 @@ from vllm.platforms import current_platform ...@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
elif current_platform.is_xpu(): elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef] from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef]
class PagedAttention: class PagedAttention:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment