[CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
7721ef17 · Li, Jiang · GitHub · 8369b7c2 · 7721ef17 · 7721ef17
Unverified commit 7721ef17 · authored Jul 08, 2025 by Li, Jiang · committed by GitHub Jul 07, 2025
9 changed files
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -48,10 +48,16 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # Note: disabled until V1 is supported
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: Bart is disabled until it supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+                --ignore=tests/models/language/generation/test_bart.py
    pytest -v -s tests/models/language/pooling -m cpu_model
    pytest -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,21 +68,15 @@ function cpu_tests() {
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" 
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+  # Note: the AWQ test below is disabled until V1 is supported
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
  #   VLLM_USE_V1=0 pytest -s -v \
  #   tests/quantization/test_ipex_quant.py"
-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-    tests/basic_correctness/test_chunked_prefill.py"  
  # online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -294,61 +294,3 @@ def test_with_prefix_caching(
        name_0="w/o prefix caching",
        name_1="with prefix caching",
    )
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
-@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
-@pytest.mark.parametrize("enforce_eager", [False])
-@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
-@pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
-def test_models_cpu(
-    hf_runner: HfRunner,
-    vllm_runner: VllmRunner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    attention_backend: str,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    test_models(
-        hf_runner,
-        vllm_runner,
-        example_prompts,
-        model,
-        dtype,
-        max_tokens,
-        chunked_prefill_token_size,
-        enforce_eager,
-        1,
-        attention_backend,
-        monkeypatch,
-    )
-@pytest.mark.parametrize("max_tokens", [16])
-@pytest.mark.parametrize("enforce_eager", [False])
-@pytest.mark.parametrize("chunk_size", [30, 32])
-@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
-@pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
-def test_with_prefix_caching_cpu(
-    vllm_runner: VllmRunner,
-    max_tokens: int,
-    enforce_eager: bool,
-    chunk_size: int,
-    dtype: str,
-) -> None:
-    test_with_prefix_caching(
-        vllm_runner,
-        max_tokens,
-        enforce_eager,
-        chunk_size,
-        1,
-        dtype,
-    )
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -39,7 +39,7 @@ AITER_MODEL_LIST = [
    [
        pytest.param(
            "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
        ),
        pytest.param(
            "openai-community/gpt2",  # gpt2
@@ -87,7 +87,11 @@ AITER_MODEL_LIST = [
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
            "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
+        ),
+        pytest.param(
+            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            marks=[pytest.mark.cpu_model],
        )
    ])
 @pytest.mark.parametrize("max_tokens", [32])

--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 from typing import Optional
 import pytest
@@ -29,8 +28,10 @@ def v1(run_with_both_engines):
        # [Decoder-only]
        pytest.param("BAAI/bge-multilingual-gemma2",
                     marks=[pytest.mark.core_model]),
-        pytest.param("intfloat/e5-mistral-7b-instruct",
+        pytest.param(
-                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+            "intfloat/e5-mistral-7b-instruct",
+            # CPU v1 doesn't support sliding window
+            marks=[pytest.mark.core_model]),
        # the qwen models interfere with each other (see PR
        # https://github.com/vllm-project/vllm/pull/18720).
        # To avoid this problem, for now we skip v0 since it will be
@@ -38,9 +39,11 @@ def v1(run_with_both_engines):
        pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                     marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
        # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5",
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
            marks=[
-                         pytest.mark.core_model, pytest.mark.cpu_model,
+                # CPU only supports V1
+                pytest.mark.core_model,
                pytest.mark.skip_v1
            ]),
        pytest.param("sentence-transformers/all-MiniLM-L12-v2",
@@ -61,10 +64,6 @@ def test_models(
    model,
    monkeypatch,
 ) -> None:
-    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
    if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend

--- a/tests/models/language/pooling/test_reward.py
+++ b/tests/models/language/pooling/test_reward.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
    dtype: str,
    monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
    if current_platform.is_rocm():
        # ROCm Triton FA does not currently support sliding window attention
        # switch to use ROCm CK FA backend

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -45,6 +45,7 @@ def use_v0_only(monkeypatch):
    """
    This module relies on V0 internals, so set VLLM_USE_V1=0.
    """
+    if not current_platform.is_cpu():
        monkeypatch.setenv('VLLM_USE_V1', '0')

--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import List, Optional, Tuple
-try:
-    import intel_extension_for_pytorch.llm.modules as ipex_modules
-    _use_ipex = True
-# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813
-except (ImportError, AttributeError):
-    _use_ipex = False
-import torch
-from vllm import _custom_ops as ops
-class _PagedAttention:
-    @staticmethod
-    def get_supported_head_sizes() -> List[int]:
-        return [32, 64, 80, 96, 112, 128, 192, 256]
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> Tuple[int, ...]:
-        return 2, num_blocks, block_size * num_kv_heads * head_size
-    @staticmethod
-    def split_kv_cache(
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        x = 16 // kv_cache.element_size()
-        num_blocks = kv_cache.shape[1]
-        key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x,
-                                   -1, x)
-        value_cache = kv_cache[1]
-        value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1)
-        return key_cache, value_cache
-    @staticmethod
-    def write_to_paged_cache(
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        ops.reshape_and_cache(
-            key,
-            value,
-            key_cache,
-            value_cache,
-            slot_mapping.flatten(),
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-        )
-    @staticmethod
-    def forward_decode(
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        block_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        kv_cache_dtype: str,
-        num_kv_heads: int,
-        scale: float,
-        alibi_slopes: Optional[torch.Tensor],
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        tp_rank: int = 0
-        blocksparse_local_blocks: int = 0
-        blocksparse_vert_stride: int = 0
-        blocksparse_block_size: int = 64
-        blocksparse_head_sliding_step: int = 0
-        block_size = value_cache.shape[3]
-        ops.paged_attention_v1(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            block_tables,
-            context_lens,
-            block_size,
-            max_context_len,
-            alibi_slopes,
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-            tp_rank,
-            blocksparse_local_blocks,
-            blocksparse_vert_stride,
-            blocksparse_block_size,
-            blocksparse_head_sliding_step,
-        )
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dists: torch.Tensor,
-        *args,
-    ) -> None:
-        key_caches = [kv_cache[0] for kv_cache in kv_caches]
-        value_caches = [kv_cache[1] for kv_cache in kv_caches]
-        ops.copy_blocks(key_caches, value_caches, src_to_dists)
-class _IPEXPagedAttention(_PagedAttention):
-    @staticmethod
-    def split_kv_cache(
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-        *args,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        num_blocks = kv_cache.shape[1]
-        key_cache = kv_cache[0]
-        key_cache = key_cache.view(num_blocks, num_kv_heads, -1, head_size)
-        value_cache = kv_cache[1]
-        value_cache = value_cache.view(num_blocks, num_kv_heads, -1, head_size)
-        return key_cache, value_cache
-    @staticmethod
-    def write_to_paged_cache(
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        slot_mapping: torch.Tensor,
-        kv_cache_dtype: str,
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        ipex_modules.PagedAttention.reshape_and_cache(
-            key, value, key_cache, value_cache,
-            slot_mapping.flatten().int())
-    @staticmethod
-    def forward_decode(
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        block_tables: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        kv_cache_dtype: str,
-        num_kv_heads: int,
-        scale: float,
-        alibi_slopes: Optional[torch.Tensor],
-        k_scale: torch.Tensor,
-        v_scale: torch.Tensor,
-        *args,
-    ) -> None:
-        block_size = value_cache.shape[2]
-        head_mapping = torch.arange(
-            0,
-            num_kv_heads,
-            device="cpu",
-            dtype=torch.int32,
-        ).view(num_kv_heads,
-               1).repeat_interleave(query.size(1) // num_kv_heads).flatten()
-        ipex_modules.PagedAttention.single_query_cached_kv_attention(
-            output, query.contiguous(), key_cache, value_cache, head_mapping,
-            scale, block_tables, context_lens, block_size, max_context_len,
-            alibi_slopes)
-PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py