[Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support...

[Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support on legacy platforms (#27526) Signed-off-by: jiang1.li <jiang1.li@intel.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>

[Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support...
[Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support on legacy platforms (#27526) Signed-off-by: jiang1.li <jiang1.li@intel.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
d34f5fe9 · Li, Jiang · GitHub · bdb01a38 · d34f5fe9 · d34f5fe9
Unverified Commit d34f5fe9 authored Oct 28, 2025 by Li, Jiang Committed by GitHub Oct 27, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 10 deletions

docker/Dockerfile.cpu docker/Dockerfile.cpu +1 -1

vllm/model_executor/layers/utils.py vllm/model_executor/layers/utils.py +21 -9

No files found.
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -79,7 +79,7 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
 ######################### BUILD IMAGE #########################
 FROM base AS vllm-build
-ARG max_jobs=2
+ARG max_jobs=32
 ENV MAX_JOBS=${max_jobs}
 ARG GIT_REPO_CHECK=0

--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -8,9 +8,12 @@ import torch
 from vllm import _custom_ops as ops
 from vllm import envs
+from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
+logger = init_logger(__name__)
 def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
    # Shuffle weight along the last dimension so that
@@ -178,19 +181,28 @@ def dispatch_cpu_unquantized_gemm(
        )
        if remove_weight:
            layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+        return
    elif (
        ops._supports_onednn
        and current_platform.get_cpu_architecture() != CpuArchEnum.POWERPC
    ):
-        origin_weight = layer.weight
+        try:
-        if remove_weight:
+            origin_weight = layer.weight
-            layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
+            handler = ops.create_onednn_mm(origin_weight.t(), 32)
-        handler = ops.create_onednn_mm(origin_weight.t(), 32)
+            layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(handler, x, bias)
-        layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm(handler, x, bias)
+            if remove_weight:
-    else:
+                layer.weight = torch.nn.Parameter(torch.empty(0), requires_grad=False)
-        layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear(
+            return
-            x, weight, bias
+        except RuntimeError as e:
-        )
+            logger.warning_once(
+                "Failed to create oneDNN linear, fallback to torch linear."
+                f" Exception: {e}"
+            )
+    # fallback case
+    layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear(
+        x, weight, bias
+    )
 def cpu_unquantized_gemm(