Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
440f0e7d
Unverified
Commit
440f0e7d
authored
Mar 03, 2026
by
Li, Jiang
Committed by
GitHub
Mar 03, 2026
Browse files
[Bugfix] Avoid src/dst as None in irecv/isend_tensor_dict (#35754)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
fd4a90f3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
30 additions
and
12 deletions
+30
-12
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
...ite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+21
-4
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+9
-8
No files found.
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
View file @
440f0e7d
#!/bin/bash
set
-euox
pipefail
export
VLLM_CPU_CI_ENV
=
0
echo
"--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-pp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
timeout
600 bash
-c
"until curl localhost:8000/v1/models
> /dev/null 2>&1
; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--result-dir
./test_results
\
--result-filename
tp_pp.json
\
--save-result
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
failed_req
=
$(
jq
'.failed'
./test_results/tp_pp.json
)
if
[
"
$failed_req
"
-ne
0
]
;
then
echo
"Some requests were failed!"
exit
1
fi
echo
"--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-dp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
timeout
600 bash
-c
"until curl localhost:8000/v1/models
> /dev/null 2>&1
; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--result-dir
./test_results
\
--result-filename
dp_pp.json
\
--save-result
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
failed_req
=
$(
jq
'.failed'
./test_results/dp_pp.json
)
if
[
"
$failed_req
"
-ne
0
]
;
then
echo
"Some requests were failed!"
exit
1
fi
vllm/distributed/parallel_state.py
View file @
440f0e7d
...
...
@@ -851,6 +851,10 @@ class GroupCoordinator:
if
self
.
world_size
<=
1
:
return
[]
if
dst
is
None
:
dst
=
(
self
.
rank_in_group
+
1
)
%
self
.
world_size
assert
dst
<
self
.
world_size
,
f
"Invalid dst rank (
{
dst
}
)"
if
self
.
use_cpu_custom_send_recv
:
if
self
.
device_communicator
is
None
:
raise
ValueError
(
"No device communicator found"
)
...
...
@@ -868,10 +872,6 @@ class GroupCoordinator:
group
=
self
.
device_group
metadata_group
=
self
.
cpu_group
if
dst
is
None
:
dst
=
(
self
.
rank_in_group
+
1
)
%
self
.
world_size
assert
dst
<
self
.
world_size
,
f
"Invalid dst rank (
{
dst
}
)"
metadata_list
,
tensor_list
=
_split_tensor_dict
(
tensor_dict
)
self
.
send_object
(
metadata_list
,
dst
=
dst
)
...
...
@@ -948,6 +948,11 @@ class GroupCoordinator:
]:
if
not
torch
.
distributed
.
is_initialized
()
or
self
.
world_size
==
1
:
return
None
,
[],
[]
if
src
is
None
:
src
=
(
self
.
rank_in_group
-
1
)
%
self
.
world_size
assert
src
<
self
.
world_size
,
f
"Invalid src rank (
{
src
}
)"
if
self
.
use_cpu_custom_send_recv
:
if
self
.
device_communicator
is
None
:
raise
ValueError
(
"No device communicator found"
)
...
...
@@ -965,10 +970,6 @@ class GroupCoordinator:
group
=
self
.
device_group
metadata_group
=
self
.
cpu_group
if
src
is
None
:
src
=
(
self
.
rank_in_group
-
1
)
%
self
.
world_size
assert
src
<
self
.
world_size
,
f
"Invalid src rank (
{
src
}
)"
recv_metadata_list
=
self
.
recv_object
(
src
=
src
)
tensor_dict
:
dict
[
str
,
Any
]
=
{}
handles
:
list
[
Handle
]
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment