Unverified Commit 58c468f4 authored by Trevor Morris, committed by GitHub

Fix FP4 MoE accuracy from missing routed_scaling_factor (#8333)

parent f8ca2368
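
The change multiplies the fused FP4 MoE kernel output by the model's routed_scaling_factor, when one is set, in both the FlashInfer (trtllm_fp4_block_scale_moe) and CUTLASS (cutlass_moe_fp4) code paths. A minimal sketch of the pattern being added; the helper name apply_routed_scaling is hypothetical and not part of the sglang API:

    import torch
    from typing import Optional

    def apply_routed_scaling(moe_output: torch.Tensor,
                             routed_scaling_factor: Optional[float]) -> torch.Tensor:
        # DeepSeek-style MoE models scale the combined routed-expert output by a
        # per-model routed_scaling_factor. If the fused FP4 kernel does not apply
        # it internally, it has to be applied to the kernel output afterwards,
        # otherwise the routed contribution is mis-scaled and accuracy drops.
        if routed_scaling_factor is not None:
            moe_output = moe_output * routed_scaling_factor
        return moe_output
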
@@ -952,7 +952,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
         if self.enable_flashinfer_moe:
@@ -982,13 +981,15 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 tp_size=tp_size,
                 tp_rank=tp_rank,
                 tune_max_num_tokens=next_power_of_2(x.shape[0]),
-            )
-            return output[0]
+            )[0]
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output
         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
         topk_weights, topk_ids, _ = topk_output
-        return cutlass_moe_fp4(
+        output = cutlass_moe_fp4(
             a=x,
             a1_gscale=layer.w13_input_scale_quant,
             w1_fp4=layer.w13_weight,
@@ -1003,3 +1004,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             params=layer.cutlass_moe_params,
             apply_router_weight_on_input=apply_router_weight_on_input,
         ).to(x.dtype)
+        if routed_scaling_factor is not None:
+            output *= routed_scaling_factor
+        return output
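
For reference, scaling the combined kernel output is equivalent to scaling every top-k routing weight, because the MoE output is a weighted sum of expert outputs. An illustrative check with toy tensors (not sglang code, shapes chosen arbitrarily):

    import torch

    torch.manual_seed(0)
    expert_out = torch.randn(4, 2, 8)   # [num_tokens, top_k, hidden_size]
    topk_weights = torch.rand(4, 2)     # per-token routing weights
    s = 2.5                             # routed_scaling_factor

    # Weighted sum of expert outputs, then scale the combined result ...
    scaled_after = (topk_weights.unsqueeze(-1) * expert_out).sum(dim=1) * s
    # ... versus scaling the routing weights before the sum.
    scaled_weights = ((topk_weights * s).unsqueeze(-1) * expert_out).sum(dim=1)

    assert torch.allclose(scaled_after, scaled_weights, atol=1e-6)
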
@@ -433,10 +433,6 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
         # DeepEP MoE
         if self.enable_deepep_moe: