Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2aab9acf
Unverified
Commit
2aab9acf
authored
Apr 20, 2026
by
Fadi Arafeh
Committed by
GitHub
Apr 20, 2026
Browse files
[CPU][BugFix] Fix inter-node pipeline parallel (#40150)
Signed-off-by:
Fadi Arafeh
<
fadi.arafeh@arm.com
>
parent
58631d7c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
2 deletions
+17
-2
vllm/distributed/device_communicators/cpu_communicator.py
vllm/distributed/device_communicators/cpu_communicator.py
+13
-0
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+4
-2
No files found.
vllm/distributed/device_communicators/cpu_communicator.py
View file @
2aab9acf
...
@@ -45,6 +45,9 @@ class CpuCommunicator(DeviceCommunicatorBase):
...
@@ -45,6 +45,9 @@ class CpuCommunicator(DeviceCommunicatorBase):
unique_name
,
unique_name
,
)
)
# send/recv tensor_dict is only supported through the SHM communicator backend
self
.
supports_tensor_dict
=
isinstance
(
self
.
dist_module
,
_CPUSHMDistributed
)
if
self
.
use_all2all
:
if
self
.
use_all2all
:
if
self
.
all2all_backend
!=
"naive"
:
# type: ignore[has-type]
if
self
.
all2all_backend
!=
"naive"
:
# type: ignore[has-type]
logger
.
warning
(
logger
.
warning
(
...
@@ -143,12 +146,22 @@ class CpuCommunicator(DeviceCommunicatorBase):
...
@@ -143,12 +146,22 @@ class CpuCommunicator(DeviceCommunicatorBase):
tensor_dict
:
dict
[
str
,
torch
.
Tensor
|
Any
],
tensor_dict
:
dict
[
str
,
torch
.
Tensor
|
Any
],
dst
:
int
,
dst
:
int
,
)
->
None
:
)
->
None
:
if
not
self
.
supports_tensor_dict
:
raise
NotImplementedError
(
"CpuCommunicator does not support tensor dict fastpath with "
"torch.distributed backend."
)
return
self
.
dist_module
.
send_tensor_dict
(
tensor_dict
,
dst
)
return
self
.
dist_module
.
send_tensor_dict
(
tensor_dict
,
dst
)
def
recv_tensor_dict
(
def
recv_tensor_dict
(
self
,
self
,
src
:
int
,
src
:
int
,
)
->
dict
[
str
,
torch
.
Tensor
|
Any
]:
)
->
dict
[
str
,
torch
.
Tensor
|
Any
]:
if
not
self
.
supports_tensor_dict
:
raise
NotImplementedError
(
"CpuCommunicator does not support tensor dict fastpath with "
"torch.distributed backend."
)
return
self
.
dist_module
.
recv_tensor_dict
(
src
)
return
self
.
dist_module
.
recv_tensor_dict
(
src
)
def
dispatch_router_logits
(
def
dispatch_router_logits
(
...
...
vllm/distributed/parallel_state.py
View file @
2aab9acf
...
@@ -394,8 +394,10 @@ class GroupCoordinator:
...
@@ -394,8 +394,10 @@ class GroupCoordinator:
current_platform
.
is_tpu
()
or
current_platform
.
use_custom_op_collectives
()
current_platform
.
is_tpu
()
or
current_platform
.
use_custom_op_collectives
()
)
)
self
.
use_cpu_custom_send_recv
=
current_platform
.
is_cpu
()
and
hasattr
(
self
.
use_cpu_custom_send_recv
=
(
torch
.
ops
.
_C
,
"init_shm_manager"
current_platform
.
is_cpu
()
and
self
.
device_communicator
and
getattr
(
self
.
device_communicator
,
"supports_tensor_dict"
,
False
)
)
)
def
create_mq_broadcaster
(
def
create_mq_broadcaster
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment