Unverified commit b7170cc8 authored by Trevor Morris, committed by GitHub

[bugfix] Fix flashinfer cutlass EP moe after MoE refactor (#8630)

parent 5c14515f
@@ -119,6 +119,7 @@ class FusedMoE(torch.nn.Module):
                 * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-        self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+        if not self.enable_flashinfer_cutlass_moe:
+            self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
         self.routed_scaling_factor = routed_scaling_factor
......
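For context, here is a minimal self-contained sketch of the expert-map logic this hunk touches. The function name build_expert_map and the standalone signature are hypothetical (the real logic lives inline in FusedMoE.__init__); the key point, taken from the diff, is that each EP rank renumbers its own experts to local indices and, after this fix, only materializes a GPU copy of the map when the flashinfer cutlass MoE path is disabled.

import torch

def build_expert_map(num_experts: int, ep_rank: int, ep_size: int,
                     enable_flashinfer_cutlass_moe: bool):
    """Sketch of the expert-map construction (hypothetical standalone helper)."""
    num_local_experts = num_experts // ep_size
    # -1 marks experts owned by other EP ranks.
    expert_map_cpu = torch.full((num_experts,), -1, dtype=torch.int32, device="cpu")
    start = ep_rank * num_local_experts
    end = (ep_rank + 1) * num_local_experts
    # Experts owned by this rank are renumbered 0..num_local_experts-1.
    expert_map_cpu[start:end] = torch.arange(
        0, num_local_experts, dtype=torch.int32, device="cpu"
    )
    # The fix: the flashinfer cutlass backend consumes the CPU map directly,
    # so a CUDA copy is created only for the other backends.
    expert_map_gpu = None
    if not enable_flashinfer_cutlass_moe:
        expert_map_gpu = expert_map_cpu.to(device="cuda")
    return expert_map_cpu, expert_map_gpu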
@@ -437,6 +437,11 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            if self.enable_ep_moe:
+                self.ep_size = self.tp_size
+                logger.warning(
+                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to match the tensor parallel size ({self.tp_size})."
+                )
         if self.enable_flashinfer_trtllm_moe:
             assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
......
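To make the ServerArgs change easier to follow, here is a hedged sketch of the adjusted validation as a standalone function. The helper name adjust_flashinfer_moe_args is hypothetical, and the enclosing enable_flashinfer_cutlass_moe guard is an assumption inferred from the commit title; the assertions, environment variable, and the new ep_size adjustment come from the diff above.

import logging
import os

logger = logging.getLogger(__name__)

def adjust_flashinfer_moe_args(args) -> None:
    """Sketch of the ServerArgs handling shown above (helper name is hypothetical)."""
    if args.enable_flashinfer_cutlass_moe:  # assumed guard, per the commit title
        assert (
            args.quantization == "modelopt_fp4"
        ), "modelopt_fp4 quantization is required for Flashinfer MOE"
        os.environ["TRTLLM_ENABLE_PDL"] = "1"
        if args.enable_ep_moe:
            # The fix: flashinfer cutlass EP MoE expects ep_size == tp_size,
            # so force them to agree and tell the user.
            args.ep_size = args.tp_size
            logger.warning(
                "Flashinfer cutlass MoE and EP MoE are enabled. The expert "
                f"parallel size is adjusted to match the tensor parallel size ({args.tp_size})."
            )
    if args.enable_flashinfer_trtllm_moe:
        assert args.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"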