Avoid fused_moe_triton `padding_size` circular import (#2624)

9254a33a · Xiaoyu Zhang · GitHub · 8a2681e2 · 9254a33a
Unverified commit 9254a33a, authored Dec 28, 2024 by Xiaoyu Zhang; committed by GitHub on Dec 28, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

python/sglang/srt/layers/quantization/fp8.py +4 -1

No files found.
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -28,7 +28,6 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter

 from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod
-from sglang.srt.layers.moe.fused_moe_triton.fused_moe import padding_size
 from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
@@ -548,6 +547,10 @@ class Fp8MoEMethod:
            layer.w2_input_scale = None

    def process_weights_after_loading(self, layer: Module) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+            padding_size,  # Avoid circular import
+        )
+
        # Block quant doesn't need to process weights after loading
        if self.block_quant:
            return