Fix MTP MoE weight loading with NVFP4 target model. (#10758)

9c53dad8 · Jue WANG · GitHub · 7ca1bea6 · 9c53dad8
Unverified commit 9c53dad8, authored Sep 22, 2025 by Jue WANG; committed by GitHub on Sep 22, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 1 deletion

python/sglang/srt/layers/moe/fused_moe_triton/layer.py python/sglang/srt/layers/moe/fused_moe_triton/layer.py +4 -1

No files found.
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -575,7 +575,10 @@ class FusedMoE(torch.nn.Module):
            )

        # Flashinfer assumes w31 format for w13_weight. Same for the scales.
-        if should_use_flashinfer_trtllm_moe():
+        if (
+            should_use_flashinfer_trtllm_moe()
+            and self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
+        ):
            shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]

        WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]