Unverified commit b01eeb80, authored by Shu Wang and committed by GitHub

[NVIDIA] Fix local_num_experts for EP (#8779)

parent 1ea94d3b
@@ -200,7 +200,8 @@ class FusedMoE(torch.nn.Module):
         self.quant_config = quant_config
         self.quant_method.create_weights(
             layer=self,
-            num_experts=self.num_local_experts,
+            num_experts=self.num_experts,
+            num_local_experts=self.num_local_experts,
             hidden_size=hidden_size,
             # FIXME: figure out which intermediate_size to use
             intermediate_size=self.intermediate_size_per_partition,
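For context, this call-site change matters because under expert parallelism (EP) the experts of an MoE layer are sharded across ranks, so the global expert count and the per-rank count differ; with EP disabled they coincide, which is why passing the local count as `num_experts` only misbehaved when EP was enabled. Below is a minimal sketch of that relationship. The class name, the `ep_size` argument, and the weight shapes are illustrative assumptions, not the actual sglang implementation.

```python
import torch

# Minimal sketch (assumed names and shapes, not sglang's real FusedMoE):
# under expert parallelism each rank owns num_experts // ep_size experts,
# so weight tensors are allocated with num_local_experts while routing
# still works with the global num_experts.
class FusedMoESketch(torch.nn.Module):
    def __init__(self, num_experts: int, ep_size: int,
                 hidden_size: int, intermediate_size: int):
        super().__init__()
        assert num_experts % ep_size == 0, "experts must split evenly across EP ranks"
        self.num_experts = num_experts                   # global expert count
        self.num_local_experts = num_experts // ep_size  # experts on this rank
        # Only the local shard of expert weights is materialized on each rank.
        self.w13_weight = torch.nn.Parameter(
            torch.empty(self.num_local_experts, 2 * intermediate_size, hidden_size)
        )
        self.w2_weight = torch.nn.Parameter(
            torch.empty(self.num_local_experts, hidden_size, intermediate_size)
        )
```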
@@ -752,6 +752,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         self,
         layer: torch.nn.Module,
         num_experts: int,
+        num_local_experts: int,
         hidden_size: int,
         intermediate_size_per_partition: int,
         params_dtype: torch.dtype,
@@ -765,7 +766,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         # TODO(ch-wan): check if this is needed
         layer.num_experts = num_experts
-        layer.num_local_experts = num_experts
+        layer.num_local_experts = num_local_experts
         layer.intermediate_size_per_partition = intermediate_size_per_partition
         layer.params_dtype = params_dtype
         layer.quant_config = self.quant_config
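As a usage sketch of why the second change matters: with `layer.num_local_experts` now holding the true per-rank count rather than the global one, logic that maps a global expert id to a rank-local slot (for example, during weight loading) can work correctly under EP. The helper below is an illustration under the assumption of contiguous sharding, not an sglang API.

```python
def global_to_local_expert_id(num_local_experts: int, ep_rank: int,
                              global_expert_id: int) -> int:
    """Map a global expert id to this rank's local slot, or -1 if not owned.

    Hypothetical helper for illustration; assumes experts are sharded
    contiguously across EP ranks.
    """
    start = ep_rank * num_local_experts
    if start <= global_expert_id < start + num_local_experts:
        return global_expert_id - start
    return -1
```

For example, with 8 experts over 4 EP ranks (2 per rank), global expert 5 maps to local slot 1 on rank 2 and to -1 on every other rank.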