[CPU] Refactor CPU attention backend (#27954)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CPU] Refactor CPU attention backend (#27954)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
7f829be7 · Li, Jiang · GitHub · e1710393 · 7f829be7 · 7f829be7
Unverified Commit 7f829be7 authored Nov 12, 2025 by Li, Jiang Committed by GitHub Nov 12, 2025
14 changed files
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -35,7 +35,7 @@ DEVICE_MLA_BACKENDS = {
 DEVICE_REGULAR_ATTN_BACKENDS = {
    "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
    "hip": ["ROCM_ATTN"],
-    "cpu": ["TORCH_SDPA"],
+    "cpu": ["CPU_ATTN"],
 }
 DEVICE_MLA_BLOCK_SIZES = {
@@ -86,7 +86,7 @@ def test_env(
        if device == "cpu":
            with patch("vllm.platforms.current_platform", CpuPlatform()):
                backend = get_attn_backend(16, torch.float16, None, block_size)
-            assert backend.get_name() == "TORCH_SDPA"
+            assert backend.get_name() == "CPU_ATTN"
        elif device == "hip":
            with patch("vllm.platforms.current_platform", RocmPlatform()):
@@ -224,7 +224,7 @@ def test_fp32_fallback(device: str):
    if device == "cpu":
        with patch("vllm.platforms.current_platform", CpuPlatform()):
            backend = get_attn_backend(16, torch.float32, None, 16)
-        assert backend.get_name() == "TORCH_SDPA"
+        assert backend.get_name() == "CPU_ATTN"
    elif device == "cuda":
        with patch("vllm.platforms.current_platform", CudaPlatform()):

--- a/tests/kernels/attention/test_cpu_attn.py
+++ b/tests/kernels/attention/test_cpu_attn.py
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Integration tests for FlexAttention backend vs default backend"""
 import pytest
 import torch

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -38,7 +38,11 @@ AITER_MODEL_LIST = [
    [
        pytest.param(
            "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.slow_test],
+            marks=[
+                pytest.mark.core_model,
+                pytest.mark.slow_test,
+                pytest.mark.cpu_model,
+            ],
        ),
        pytest.param(
            "openai-community/gpt2",  # gpt2
@@ -55,6 +59,10 @@ AITER_MODEL_LIST = [
                pytest.mark.slow_test,
            ],
        ),
+        pytest.param(
+            "google/gemma-2-2b-it",  # test hybrid attention
+            marks=[pytest.mark.cpu_model],
+        ),
        pytest.param(
            "zai-org/chatglm3-6b",  # chatglm (text-only)
        ),
@@ -64,7 +72,6 @@ AITER_MODEL_LIST = [
        ),
        pytest.param(
            "openbmb/MiniCPM3-4B",
-            # fused_moe not supported on CPU
            marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
        ),
        pytest.param(
@@ -93,11 +100,7 @@ AITER_MODEL_LIST = [
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
        pytest.param(
            "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model],
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-        ),
-        pytest.param(
-            "allenai/OLMoE-1B-7B-0924-Instruct",
-            marks=[pytest.mark.cpu_model],
        ),
        pytest.param("swiss-ai/Apertus-8B-Instruct-2509"),  # apertus
    ],

--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -23,8 +23,7 @@ from ...utils import check_embeddings_close
        ),
        pytest.param(
            "intfloat/e5-mistral-7b-instruct",
-            # CPU v1 doesn't support sliding window
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-            marks=[pytest.mark.core_model],
        ),
        pytest.param(
            "ssmits/Qwen2-7B-Instruct-embed-base", marks=[pytest.mark.cpu_model]

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -243,7 +243,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "FalconH1ForCausalLM": _HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"),
    "FlexOlmoForCausalLM": _HfExamplesInfo("allenai/Flex-reddit-2x7B-1T"),
    "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
-    "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
+    "Gemma2ForCausalLM": _HfExamplesInfo(
+        "google/gemma-2-9b", extras={"tiny": "google/gemma-2-2b-it"}
+    ),
    "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
    "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it"),
    "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"),

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -2583,6 +2583,88 @@ def onednn_scaled_mm(
    return output
+def cpu_attn_get_scheduler_metadata(
+    num_reqs: int,
+    num_heads: int,
+    num_kv_heads: int,
+    head_dim: int,
+    seq_lens: torch.Tensor,
+    dtype: torch.dtype,
+    query_start_loc: torch.Tensor,
+    causal: bool,
+    sliding_window_size: int,
+    isa: str,
+    enable_kv_split: bool,
+) -> torch.Tensor:
+    """Call the ``_C.get_scheduler_metadata`` custom op and return its result.
+
+    This is a pure pass-through: every parameter is handed to the op
+    positionally, in the same order as this signature, and the op's return
+    value is returned unchanged.
+
+    NOTE(review): the meaning of the returned tensor's contents is defined by
+    the compiled kernel and is not visible from this wrapper.
+    """
+    get_metadata = torch.ops._C.get_scheduler_metadata
+    return get_metadata(
+        num_reqs,
+        num_heads,
+        num_kv_heads,
+        head_dim,
+        seq_lens,
+        dtype,
+        query_start_loc,
+        causal,
+        sliding_window_size,
+        isa,
+        enable_kv_split,
+    )
+def cpu_attn_reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slot_mapping: torch.Tensor,
+    isa: str,
+) -> None:
+    """Dispatch to the ``_C.cpu_attn_reshape_and_cache`` custom op.
+
+    All six arguments are forwarded positionally in signature order. Nothing
+    is returned; any result of the op itself is discarded.
+
+    NOTE(review): presumably the op writes ``key``/``value`` into
+    ``key_cache``/``value_cache`` at the positions given by ``slot_mapping``
+    -- confirm against the kernel, this wrapper does not show it.
+    """
+    reshape_and_cache = torch.ops._C.cpu_attn_reshape_and_cache
+    reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, isa)
+def cpu_attention_with_kv_cache(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    output: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    seq_lens: torch.Tensor,
+    scale: float,
+    causal: bool,
+    alibi_slopes: torch.Tensor | None,
+    sliding_window: tuple[int, int],
+    block_table: torch.Tensor,
+    softcap: float,
+    scheduler_metadata: torch.Tensor,
+    s_aux: torch.Tensor | None,
+) -> None:
+    """Dispatch to the ``_C.cpu_attention_with_kv_cache`` custom op.
+
+    Arguments are forwarded positionally in signature order, with one
+    exception: ``sliding_window`` is not passed as a tuple. Its elements at
+    index 0 and index 1 are passed as two consecutive arguments in its place.
+
+    Nothing is returned; any result of the op itself is discarded.
+
+    NOTE(review): presumably the attention result is written into ``output``
+    in place -- confirm against the kernel, this wrapper does not show it.
+    """
+    # Resolve the op before touching ``sliding_window`` so that failures
+    # surface in the same order as a single inline call expression would.
+    attention_op = torch.ops._C.cpu_attention_with_kv_cache
+    window_first = sliding_window[0]
+    window_second = sliding_window[1]
+    attention_op(
+        query,
+        key_cache,
+        value_cache,
+        output,
+        query_start_loc,
+        seq_lens,
+        scale,
+        causal,
+        alibi_slopes,
+        window_first,
+        window_second,
+        block_table,
+        softcap,
+        scheduler_metadata,
+        s_aux,
+    )
 if hasattr(torch.ops._qutlass_C, "matmul_mxf4_bf16_tn"):
    @register_fake("_qutlass_C::matmul_mxf4_bf16_tn")

--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -49,7 +49,7 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
    ROCM_AITER_FA = (
        "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
    )
-    TORCH_SDPA = "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend"
+    TORCH_SDPA = ""  # this tag is only used for ViT
    FLASHINFER = "vllm.v1.attention.backends.flashinfer.FlashInferBackend"
    FLASHINFER_MLA = (
        "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend"
@@ -70,6 +70,7 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta):
        "vllm.v1.attention.backends.rocm_aiter_unified_attn."
        "RocmAiterUnifiedAttentionBackend"
    )
+    CPU_ATTN = "vllm.v1.attention.backends.cpu_attn.CPUAttentionBackend"
    # Placeholder for third-party/custom backends - must be registered before use
    CUSTOM = ""

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1726,9 +1726,6 @@ class EngineArgs:
                )
                _raise_unsupported_error(feature_name=name)
-        if current_platform.is_cpu() and model_config.get_sliding_window() is not None:
-            _raise_unsupported_error(feature_name="sliding window (CPU backend)")
    def _set_default_args(
        self, usage_context: UsageContext, model_config: ModelConfig
    ) -> None:

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -8,7 +8,6 @@ import platform
 import subprocess
 import sys
 from dataclasses import dataclass
-from importlib.util import find_spec
 from typing import TYPE_CHECKING
 import regex as re
@@ -139,16 +138,15 @@ class CpuPlatform(Platform):
    ) -> str:
        from vllm.attention.backends.registry import AttentionBackendEnum
-        if selected_backend and selected_backend != AttentionBackendEnum.TORCH_SDPA:
+        if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN:
            logger.info("Cannot use %s backend on CPU.", selected_backend)
        if use_mla:
            raise NotImplementedError("MLA is not supported on CPU.")
        if use_sparse:
            raise NotImplementedError("Sparse Attention is not supported on CPU.")
-        logger.info("Using Torch SDPA backend.")
        if not use_v1:
            raise ValueError("CPU backend only supports V1.")
-        return AttentionBackendEnum.TORCH_SDPA.get_path()
+        return AttentionBackendEnum.CPU_ATTN.get_path()
    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
@@ -186,15 +184,13 @@ class CpuPlatform(Platform):
        cache_config = vllm_config.cache_config
-        ipex_available = find_spec("intel_extension_for_pytorch") is not None
+        if cache_config.block_size is None:
+            cache_config.block_size = 128
-        if cache_config and cache_config.block_size is None:
+        if cache_config.block_size % 32 != 0:
-            cache_config.block_size = 128 if ipex_available else 16
+            logger.warning(
+                "CPU backend prefers block_size is multiples of 32, "
-        if not ipex_available and cache_config.block_size != 16:
+                "otherwise the performance is not optimized."
-            raise RuntimeError(
-                f"--block-size={cache_config.block_size} requires"
-                " intel_extension_for_pytorch"
            )
        scheduler_config = vllm_config.scheduler_config
@@ -207,22 +203,11 @@ class CpuPlatform(Platform):
                "backend is not compatible with FP8 KV cache."
            )
-        if cache_config.cache_dtype == "fp8_e4m3":
+        if cache_config.cache_dtype != "auto":
-            cache_config.cache_dtype = "fp8_e5m2"
-            logger.warning(
-                "CPU backend doesn't support fp8_e4m3 KV cache type, cast to fp8_e5m2."
-            )
-        if (
-            cache_config.cache_dtype != "auto"
-            and model_config is not None
-            and model_config.dtype == torch.half
-        ):
            logger.warning(
-                "FP8 KV cache on the CPU backend only does not"
+                "CPU backend doesn't support KV cache quantization fallback to auto."
-                " support fp16 for now, cast to bf16."
            )
-            model_config.dtype = torch.bfloat16
+            cache_config.cache_dtype = "auto"
        cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -57,7 +57,6 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
 # Possible string values of STR_BACKEND_ENV_VAR
 # register, corresponding to possible backends
 STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
-STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA"
 STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
 STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
 STR_INVALID_VAL: str = "INVALID"

--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -265,7 +265,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
    def _init_reorder_batch_threshold(
        self,
-        reorder_batch_threshold: int = 1,
+        reorder_batch_threshold: int | None = 1,
        supports_spec_as_decode: bool = False,
        supports_dcp_with_varlen: bool = False,
    ) -> None:

--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any
+from typing import Any
 import torch
 import torch.nn as nn
@@ -12,9 +12,6 @@ from vllm.model_executor.model_loader import get_model
 from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
 logger = init_logger(__name__)
@@ -31,15 +28,6 @@ class CPUModelRunner(GPUModelRunner):
        self._postprocess_tensors()
-    # Note: Remove the override after new attention backend finished
-    def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
-        if len(self.kv_cache_config.kv_cache_groups) > 1:
-            raise ValueError(
-                "Multiple KVCacheGroups is not"
-                "currently supported with CPU model runner."
-            )
-        super()._may_reorder_batch(scheduler_output)
    def _postprocess_tensors(self) -> None:
        # Note: replace device tensors with cpu tensors
        def replace_tensor(obj: Any, cpu_attr_name: str, device_attr_name) -> None: