Unverified Commit b7170cc8 authored by Trevor Morris, committed by GitHub

[bugfix] Fix flashinfer cutlass EP moe after MoE refactor (#8630)

parent 5c14515f
@@ -119,7 +119,8 @@ class FusedMoE(torch.nn.Module):
             self.expert_map_cpu[
                 self.moe_ep_rank * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-            self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+            if not self.enable_flashinfer_cutlass_moe:
+                self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
         self.routed_scaling_factor = routed_scaling_factor
         assert intermediate_size % self.moe_tp_size == 0
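For context, the sketch below illustrates the per-rank expert map that this hunk touches. It is illustrative only, not the FusedMoE implementation: the helper name build_expert_map_cpu, the -1 fill value for non-local experts, and the example sizes are assumptions; only the conditional CUDA copy mirrors the patch, which skips that copy when the flashinfer cutlass MoE path is enabled.

# Illustrative sketch, not the FusedMoE code: build a per-rank expert map where
# experts owned by this EP rank map to local slots 0..num_local_experts-1 and all
# other entries stay -1 (the fill value is an assumption; it is not shown in the hunk).
import torch

def build_expert_map_cpu(num_experts: int, ep_rank: int, ep_size: int) -> torch.Tensor:
    num_local_experts = num_experts // ep_size
    expert_map = torch.full((num_experts,), -1, dtype=torch.int32, device="cpu")
    expert_map[
        ep_rank * num_local_experts : (ep_rank + 1) * num_local_experts
    ] = torch.arange(0, num_local_experts, dtype=torch.int32, device="cpu")
    return expert_map

expert_map_cpu = build_expert_map_cpu(num_experts=16, ep_rank=1, ep_size=4)

# Mirroring the patch: only make the CUDA copy when the flashinfer cutlass MoE
# path is disabled; that path keeps working from the CPU map.
enable_flashinfer_cutlass_moe = False  # hypothetical stand-in for the config flag
if not enable_flashinfer_cutlass_moe and torch.cuda.is_available():
    expert_map_gpu = expert_map_cpu.to(device="cuda")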
@@ -437,6 +437,11 @@ class ServerArgs:
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            if self.enable_ep_moe:
+                self.ep_size = self.tp_size
+                logger.warning(
+                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+                )
         if self.enable_flashinfer_trtllm_moe:
             assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
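As a rough illustration of the server-argument adjustment above, the following standalone sketch reproduces the same checks. The class MoEArgsSketch and its method name are hypothetical, not the real ServerArgs API; only the control flow mirrors the hunk.

# Rough sketch with hypothetical names (MoEArgsSketch is not the real ServerArgs):
# reproduces the adjustment added by the patch, which forces the expert-parallel
# size to match the tensor-parallel size when flashinfer cutlass MoE and EP MoE
# are both enabled.
import logging
from dataclasses import dataclass

logger = logging.getLogger(__name__)

@dataclass
class MoEArgsSketch:
    tp_size: int
    ep_size: int = 1
    quantization: str = "modelopt_fp4"
    enable_ep_moe: bool = False
    enable_flashinfer_trtllm_moe: bool = False

    def adjust_for_flashinfer_cutlass_moe(self) -> None:
        # Mirrors the context lines of the hunk: the cutlass path requires modelopt_fp4.
        assert self.quantization == "modelopt_fp4", (
            "modelopt_fp4 quantization is required for Flashinfer MOE"
        )
        # The added logic: EP size follows TP size when EP MoE is enabled.
        if self.enable_ep_moe:
            self.ep_size = self.tp_size
            logger.warning(
                "Flashinfer cutlass MoE and EP MoE are enabled. "
                "The expert parallel size is adjusted to be the same as the "
                f"tensor parallel size [{self.tp_size}]."
            )
        if self.enable_flashinfer_trtllm_moe:
            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"

# Example: with tp_size=8 and EP MoE enabled, ep_size is bumped from 1 to 8.
args = MoEArgsSketch(tp_size=8, enable_ep_moe=True)
args.adjust_for_flashinfer_cutlass_moe()
assert args.ep_size == 8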