Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d1c3d7d1
Unverified
Commit
d1c3d7d1
authored
Jun 14, 2024
by
youkaichao
Committed by
GitHub
Jun 14, 2024
Browse files
[misc][distributed] fix benign error in `is_in_the_same_node` (#5512)
parent
77490c6f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
10 deletions
+10
-10
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+10
-10
No files found.
vllm/distributed/parallel_state.py
View file @
d1c3d7d1
...
...
@@ -23,8 +23,9 @@ import contextlib
from
collections
import
namedtuple
from
contextlib
import
contextmanager
,
nullcontext
from
dataclasses
import
dataclass
from
multiprocessing
import
resource_tracker
,
shared_memory
from
multiprocessing
import
shared_memory
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
unittest.mock
import
patch
import
torch
from
torch.distributed
import
Backend
,
ProcessGroup
...
...
@@ -744,6 +745,11 @@ def is_in_the_same_node(pg: ProcessGroup):
src
=
ranks
[
0
],
group
=
pg
)
name
=
recv
[
0
]
# fix to https://stackoverflow.com/q/62748654/9191338
# Python incorrectly tracks shared memory even if it is not
# created by the process. The following patch is a workaround.
with
patch
(
"multiprocessing.resource_tracker.register"
,
lambda
*
args
,
**
kwargs
:
None
):
shm
=
shared_memory
.
SharedMemory
(
name
=
name
)
if
shm
.
buf
[:
len
(
magic_message
)]
==
magic_message
:
is_in_the_same_node
[
rank
]
=
1
...
...
@@ -757,14 +763,8 @@ def is_in_the_same_node(pg: ProcessGroup):
# clean up the shared memory segment
with
contextlib
.
suppress
(
OSError
):
if
rank
==
0
:
if
shm
:
if
rank
==
0
and
shm
:
shm
.
unlink
()
else
:
if
shm
:
# fix to https://stackoverflow.com/q/62748654/9191338
resource_tracker
.
unregister
(
shm
.
_name
,
"shared_memory"
)
# type: ignore[attr-defined]
torch
.
distributed
.
all_reduce
(
is_in_the_same_node
,
group
=
pg
)
return
is_in_the_same_node
.
sum
().
item
()
==
world_size
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment