[Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)

Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>

[Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
8a57872b · Tyler Michael Smith · GitHub · 5bc1ad6c · 8a57872b · 8a57872b
Unverified commit 8a57872b, authored Jun 02, 2025 by Tyler Michael Smith; committed by GitHub on Jun 03, 2025
Showing 2 changed files with 9 additions and 1 deletion

vllm/distributed/device_communicators/all2all.py vllm/distributed/device_communicators/all2all.py +4 -0

vllm/model_executor/layers/fused_moe/layer.py vllm/model_executor/layers/fused_moe/layer.py +5 -1

No files found.
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -83,6 +83,10 @@ class PPLXAll2AllManager(All2AllManagerBase):
        assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
        super().__init__(cpu_group)

+        # TODO(tms): Disable pplx-a2a intranode as it fails with the error:
+        # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa
+        self.internode = True
+
        if self.internode:
            # inter-node communication needs nvshmem,
            # intra-node communication uses p2p mapping directly

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -269,9 +269,13 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
                    (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
                    torch.float32.itemsize)),
-                group_name=all2all_manager.cpu_group.group_name,
            )

+            # Intranode pplx a2a takes a group name while internode does not.
+            if not all2all_manager.internode:
+                all_to_all_args[
+                    "group_name"] = all2all_manager.cpu_group.group_name
+
            handle = all2all_manager.get_handle(all_to_all_args)

            prepare_finalize = PplxPrepareAndFinalize(