Unverified commit 179a619c, authored by Isotr0py, committed by GitHub
Browse files

[Bugfix] Fix broken CPU quantization due to triton import (#15038)


Signed-off-by: Isotr0py <2037008807@qq.com>
parent 452e8fd9
...@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter ...@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
from vllm.model_executor.layers.fused_moe.layer import (FusedMoE, from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
FusedMoEMethodBase) FusedMoEMethodBase)
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
...@@ -140,6 +139,10 @@ def _fused_moe_gguf( ...@@ -140,6 +139,10 @@ def _fused_moe_gguf(
qweight_type2: int, qweight_type2: int,
act, act,
) -> torch.Tensor: ) -> torch.Tensor:
# lazy import to avoid triggering triton import in CPU backend
from vllm.model_executor.layers.fused_moe.fused_moe import (
moe_align_block_size)
out_hidden_states = torch.empty_like(x) out_hidden_states = torch.empty_like(x)
if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES: if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
num_tokens, _ = x.shape num_tokens, _ = x.shape
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment