[CPU] Update custom ops for the CPU backend (#20255)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CPU] Update custom ops for the CPU backend (#20255)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
6cc1e7d9 · Li, Jiang · GitHub · 9909726d · 6cc1e7d9 · 6cc1e7d9
Unverified commit 6cc1e7d9, authored Jul 01, 2025 by Li, Jiang; committed by GitHub on Jul 01, 2025
3 changed files
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -63,7 +63,15 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
    return logits
-def rocm_unquantized_gemm(x: torch.Tensor,
+def default_unquantized_gemm(layer: torch.nn.Module,
+                             x: torch.Tensor,
+                             weight: torch.Tensor,
+                             bias: Optional[torch.Tensor] = None):
+    return torch.nn.functional.linear(x, weight, bias)
+def rocm_unquantized_gemm(layer: torch.nn.Module,
+                          x: torch.Tensor,
                          weight: torch.Tensor,
                          bias: Optional[torch.Tensor] = None):
    from vllm.platforms.rocm import on_gfx9
@@ -89,7 +97,20 @@ def rocm_unquantized_gemm(x: torch.Tensor,
    return torch.nn.functional.linear(x, weight, bias)
+def cpu_unquantized_gemm(layer: torch.nn.Module,
+                         x: torch.Tensor,
+                         weight: torch.Tensor,
+                         bias: Optional[torch.Tensor] = None):
+    if getattr(layer, "use_cpu_sgl", False):
+        return torch.ops._C.weight_packed_linear(x, weight, bias, True)
+    else:
+        return torch.nn.functional.linear(x, weight, bias)
 def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
    if current_platform.is_rocm():
        return rocm_unquantized_gemm
-    return torch.nn.functional.linear
+    elif current_platform.is_cpu():
+        return cpu_unquantized_gemm
+    else:
+        return default_unquantized_gemm
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -43,7 +43,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return dispatch_unquantized_gemm()(x, layer.weight, bias)
+        return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
    def embedding(self, layer: torch.nn.Module,
                  input_: torch.Tensor) -> torch.Tensor:

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -194,6 +194,8 @@ class CpuPlatform(Platform):
                "epilogue_fusion":
                True,
            })
+            if compilation_config.use_inductor:
+                compilation_config.custom_ops = ["none"]
        if vllm_config.lora_config is not None:
            compilation_config.level = CompilationLevel.NO_COMPILATION