Unverified Commit 12f2e6c3 authored by Xihuai Wang, committed by GitHub

Fix: #3988 using blockwise_int8 (#4023)

parent 95575aa7
@@ -371,6 +371,8 @@ class BlockInt8MoEMethod:
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
     ) -> torch.Tensor:
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
         from sglang.srt.layers.moe.topk import select_experts
@@ -395,7 +397,7 @@ class BlockInt8MoEMethod:
             layer.w2_weight,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
-            inplace=True,
+            inplace=inplace,
             activation=activation,
             use_int8_w8a8=True,
             w1_scale=(layer.w13_weight_scale_inv),
@@ -403,4 +405,5 @@ class BlockInt8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
+            no_combine=no_combine,
         )
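
The patch threads `inplace` and `no_combine` from `BlockInt8MoEMethod.apply` through to `fused_experts` instead of hard-coding `inplace=True`, so callers that still need the input activations untouched, or that want the expert outputs before the top-k combine, can opt out. Below is a minimal sketch of the hazard in plain PyTorch, not the real sglang kernel; `toy_experts` is a hypothetical stand-in for `fused_experts` that models only the `inplace` handling:

import torch

# Hypothetical stand-in for fused_experts; only the `inplace` behavior is modeled.
def toy_experts(x: torch.Tensor, inplace: bool = True) -> torch.Tensor:
    out = x if inplace else torch.empty_like(x)
    torch.mul(x, 2.0, out=out)  # stand-in for the fused expert computation
    return out

x = torch.ones(4)
y = toy_experts(x, inplace=False)  # out-of-place: x is preserved
assert torch.equal(x, torch.ones(4))

y = toy_experts(x, inplace=True)   # in-place: x is overwritten with the result
assert torch.equal(x, y)

With a hard-coded `inplace=True`, any caller that reads the input tensor after the call would silently see the expert output instead; exposing the flag lets such callers pay for one extra buffer only when they need it.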