[Bugfix] Size FlashInfer NVLink MNNVL workspace to EP group (#40893)

Signed-off-by: Dao Le <Dao007forever@gmail.com>

[Bugfix] Size FlashInfer NVLink MNNVL workspace to EP group (#40893)
Signed-off-by: Dao Le <Dao007forever@gmail.com>
9558f439 · Dao007forever · GitHub · 8cd174fa · 9558f439
Unverified Commit 9558f439 authored Apr 26, 2026 by Dao007forever Committed by GitHub Apr 26, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 7 deletions

vllm/distributed/device_communicators/all2all.py vllm/distributed/device_communicators/all2all.py +13 -7

No files found.
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -492,15 +492,18 @@ class FlashInferNVLinkTwoSidedManager(All2AllManagerBase):
            CustomCommunicator,
        )
-        dp_config = MnnvlConfig(
+        # MNNVL workspace is allocated per rank in the comm_backend's group; the
-            comm_backend=CustomCommunicator(get_dp_group().cpu_group),
+        # flashinfer kernel asserts workspace.size(0) == moe_ep_size, so the backend
+        # must span the EP group (= DP*PCP*TP), not the DP group.
+        ep_config = MnnvlConfig(
+            comm_backend=CustomCommunicator(self.cpu_group),
            fabric_page_size=1 << 29,  # 512MB
            allocation_granularity=0,  # Auto-detect
        )
-        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(self.mapping, dp_config)
+        self.workspace_tensor = MnnvlMoe.get_moe_workspaces(self.mapping, ep_config)
        self.prepare_workspace_tensor = MnnvlMoe.get_moe_prepare_workspace(
-            self.mapping, dp_config
+            self.mapping, ep_config
        )
        self.world_size = world_size
@@ -605,8 +608,11 @@ class FlashInferNVLinkOneSidedManager(All2AllManagerBase):
            CustomCommunicator,
        )
-        dp_config = MnnvlConfig(
+        # MNNVL workspace is allocated per rank in the comm_backend's group; the
-            comm_backend=CustomCommunicator(get_dp_group().cpu_group),
+        # flashinfer kernel asserts workspace.size(0) == moe_ep_size, so the backend
+        # must span the EP group (= DP*PCP*TP), not the DP group.
+        ep_config = MnnvlConfig(
+            comm_backend=CustomCommunicator(self.cpu_group),
        )
        total_dispatch_payload_size_per_token = (
            hidden_size // 2  # nvfp4 hidden states
@@ -628,7 +634,7 @@ class FlashInferNVLinkOneSidedManager(All2AllManagerBase):
            top_k=top_k,
            num_experts=num_experts,
            workspace_size_per_rank=self.workspace_size,
-            mnnvl_config=dp_config,
+            mnnvl_config=ep_config,
        )
        self.gpus_per_node = gpus_per_node