[Bugfix] Fix broken CPU quantization due to triton import (#15038)

Signed-off-by: Isotr0py <2037008807@qq.com>

[Bugfix] Fix broken CPU quantization due to triton import (#15038)
Signed-off-by: Isotr0py <2037008807@qq.com>
179a619c · Isotr0py · GitHub · 452e8fd9 · 179a619c
Unverified commit 179a619c, authored Mar 18, 2025 by Isotr0py; committed by GitHub on Mar 18, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/model_executor/layers/quantization/gguf.py +4 -1

No files found.
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -10,7 +10,6 @@ from torch.nn.parameter import Parameter, UninitializedParameter
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.fused_moe import moe_align_block_size
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                        FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -140,6 +139,10 @@ def _fused_moe_gguf(
    qweight_type2: int,
    act,
 ) -> torch.Tensor:
+    # lazy import to avoid triggering triton import in CPU backend
+    from vllm.model_executor.layers.fused_moe.fused_moe import (
+        moe_align_block_size)
+
    out_hidden_states = torch.empty_like(x)
    if qweight_type2 in MMQ_QUANT_TYPES and qweight_type in MMQ_QUANT_TYPES:
        num_tokens, _ = x.shape