Commit 6cc1e7d9 (unverified), authored by Li, Jiang; committed by GitHub
Browse files

[CPU] Update custom ops for the CPU backend (#20255)


Signed-off-by: jiang1.li <jiang1.li@intel.com>
parent 9909726d
...@@ -63,7 +63,15 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, ...@@ -63,7 +63,15 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
return logits return logits
def default_unquantized_gemm(layer: torch.nn.Module,
                             x: torch.Tensor,
                             weight: torch.Tensor,
                             bias: Optional[torch.Tensor] = None):
    """Compute the unquantized linear projection of ``x``.

    Args:
        layer: Module that owns the weight. It is not read here; it is
            accepted so the signature matches the other
            ``*_unquantized_gemm`` implementations.
        x: Input tensor.
        weight: Weight tensor handed to ``torch.nn.functional.linear``.
        bias: Optional bias tensor.

    Returns:
        The result of ``torch.nn.functional.linear(x, weight, bias)``.
    """
    output = torch.nn.functional.linear(x, weight, bias)
    return output
def rocm_unquantized_gemm(layer: torch.nn.Module,
x: torch.Tensor,
weight: torch.Tensor, weight: torch.Tensor,
bias: Optional[torch.Tensor] = None): bias: Optional[torch.Tensor] = None):
from vllm.platforms.rocm import on_gfx9 from vllm.platforms.rocm import on_gfx9
...@@ -89,7 +97,20 @@ def rocm_unquantized_gemm(x: torch.Tensor, ...@@ -89,7 +97,20 @@ def rocm_unquantized_gemm(x: torch.Tensor,
return torch.nn.functional.linear(x, weight, bias) return torch.nn.functional.linear(x, weight, bias)
def cpu_unquantized_gemm(layer: torch.nn.Module,
                         x: torch.Tensor,
                         weight: torch.Tensor,
                         bias: Optional[torch.Tensor] = None):
    """Compute the unquantized linear projection of ``x`` on the CPU backend.

    Args:
        layer: Module that owns the weight. Its optional ``use_cpu_sgl``
            attribute selects the kernel; a missing attribute counts as
            ``False``.
        x: Input tensor.
        weight: Weight tensor.
        bias: Optional bias tensor.

    Returns:
        The output of ``torch.ops._C.weight_packed_linear`` when
        ``layer.use_cpu_sgl`` is truthy, otherwise the output of
        ``torch.nn.functional.linear``.
    """
    use_packed_kernel = getattr(layer, "use_cpu_sgl", False)
    if not use_packed_kernel:
        return torch.nn.functional.linear(x, weight, bias)
    # NOTE(review): the trailing ``True`` is a flag defined by the custom
    # op's schema; presumably the weight must already be in the packed
    # layout that op expects -- confirm against the op registration.
    return torch.ops._C.weight_packed_linear(x, weight, bias, True)
def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
    """Pick the unquantized GEMM implementation for the current platform.

    Returns:
        ``rocm_unquantized_gemm`` when ``current_platform.is_rocm()`` is
        true, ``cpu_unquantized_gemm`` when ``current_platform.is_cpu()``
        is true, and ``default_unquantized_gemm`` otherwise. Each of these
        is called as ``fn(layer, x, weight, bias)``.
    """
    # ROCm is checked first, then CPU; everything else uses the default.
    if current_platform.is_rocm():
        return rocm_unquantized_gemm
    if current_platform.is_cpu():
        return cpu_unquantized_gemm
    return default_unquantized_gemm
...@@ -43,7 +43,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): ...@@ -43,7 +43,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer: torch.nn.Module, layer: torch.nn.Module,
x: torch.Tensor, x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor: bias: Optional[torch.Tensor] = None) -> torch.Tensor:
return dispatch_unquantized_gemm()(x, layer.weight, bias) return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
def embedding(self, layer: torch.nn.Module, def embedding(self, layer: torch.nn.Module,
input_: torch.Tensor) -> torch.Tensor: input_: torch.Tensor) -> torch.Tensor:
......
...@@ -194,6 +194,8 @@ class CpuPlatform(Platform): ...@@ -194,6 +194,8 @@ class CpuPlatform(Platform):
"epilogue_fusion": "epilogue_fusion":
True, True,
}) })
if compilation_config.use_inductor:
compilation_config.custom_ops = ["none"]
if vllm_config.lora_config is not None: if vllm_config.lora_config is not None:
compilation_config.level = CompilationLevel.NO_COMPILATION compilation_config.level = CompilationLevel.NO_COMPILATION
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment