Unverified Commit ff47aab0 authored by Li, Jiang, committed by GitHub
Browse files

[CPU] Upgrade CPU backend to torch-2.6 (#13381)


Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
parent debd6bbf
...@@ -19,13 +19,14 @@ remove_docker_container ...@@ -19,13 +19,14 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2 export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference # offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
...@@ -36,6 +37,7 @@ function cpu_tests() { ...@@ -36,6 +37,7 @@ function cpu_tests() {
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e set -e
pip install -r vllm/requirements/test.txt pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model
...@@ -85,4 +87,4 @@ function cpu_tests() { ...@@ -85,4 +87,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
...@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li ...@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
RUN echo 'ulimit -c 0' >> ~/.bashrc RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.5.0 RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace WORKDIR /workspace
......
...@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) ...@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare( FetchContent_Declare(
oneDNN oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.6 GIT_TAG v3.7.1
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
GIT_SHALLOW TRUE GIT_SHALLOW TRUE
) )
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
-r common.txt -r common.txt
# Dependencies for CPUs # Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x" torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin" torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.7.0.dev20250304; platform_machine == "s390x" torch==2.7.0.dev20250304; platform_machine == "s390x"
......
...@@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest ...@@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform from vllm.platforms import current_platform
@pytest.fixture(autouse=True) @pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora): def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test # Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every # This can be promoted up to conftest.py to run for every
......
...@@ -17,7 +17,7 @@ class _PagedAttention: ...@@ -17,7 +17,7 @@ class _PagedAttention:
@staticmethod @staticmethod
def get_supported_head_sizes() -> List[int]: def get_supported_head_sizes() -> List[int]:
return [32, 64, 80, 96, 112, 128, 256] return [32, 64, 80, 96, 112, 128, 192, 256]
@staticmethod @staticmethod
def get_kv_cache_shape( def get_kv_cache_shape(
......
...@@ -254,6 +254,7 @@ def _run_worker_process( ...@@ -254,6 +254,7 @@ def _run_worker_process(
# online (in situ) tuning is enabled. # online (in situ) tuning is enabled.
# Offline tuning API (record_untuned_is_enabled()) only # Offline tuning API (record_untuned_is_enabled()) only
# available in PyTorch 2.6 or later. # available in PyTorch 2.6 or later.
if torch.cuda.is_available():
import torch.cuda.tunable as tunable import torch.cuda.tunable as tunable
if (tunable.is_enabled() and tunable.tuning_is_enabled() if (tunable.is_enabled() and tunable.tuning_is_enabled()
and not tunable.record_untuned_is_enabled()): and not tunable.record_untuned_is_enabled()):
......
...@@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
global_num_experts: int = -1, global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None, expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None, custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu", activation: str = "silu",
**kwargs, **kwargs,
): ):
assert custom_routing_function is None
assert activation == "silu", f"{activation} is not supported." assert activation == "silu", f"{activation} is not supported."
return layer.ipex_fusion( return layer.ipex_fusion(
x, x,
...@@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
renormalize, renormalize,
topk_group, topk_group,
num_expert_group, num_expert_group,
custom_routing_function,
scoring_func,
e_score_correction_bias,
) )
def forward_tpu( def forward_tpu(
......
...@@ -121,6 +121,9 @@ class CpuPlatform(Platform): ...@@ -121,6 +121,9 @@ class CpuPlatform(Platform):
# Disable torch async compiling which won't work with daemonic processes # Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
# MLA attention is not supported
os.environ["VLLM_MLA_DISABLE"] = "1"
# Intel OpenMP setting # Intel OpenMP setting
ld_prealod_str = os.getenv("LD_PRELOAD", "") ld_prealod_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_prealod_str: if "libiomp5.so" in ld_prealod_str:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment