Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c391e4b6
Unverified
Commit
c391e4b6
authored
Apr 04, 2024
by
youkaichao
Committed by
GitHub
Apr 04, 2024
Browse files
[Core] improve robustness of pynccl (#3860)
parent
9117f892
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
8 deletions
+13
-8
vllm/model_executor/parallel_utils/pynccl.py
vllm/model_executor/parallel_utils/pynccl.py
+13
-8
No files found.
vllm/model_executor/parallel_utils/pynccl.py
View file @
c391e4b6
...
@@ -236,22 +236,25 @@ class NCCLCommunicator:
...
@@ -236,22 +236,25 @@ class NCCLCommunicator:
if
local_rank
==
-
1
:
if
local_rank
==
-
1
:
local_rank
=
self
.
rank
local_rank
=
self
.
rank
self
.
local_rank
=
local_rank
self
.
local_rank
=
local_rank
torch
.
cuda
.
set_device
(
local_rank
)
# don't use these args, as they can be -1
if
rank
==
0
:
# use `self.rank`, `self.local_rank` and `self.world_size` instead
del
world_size
,
rank
,
local_rank
torch
.
cuda
.
set_device
(
self
.
local_rank
)
if
self
.
rank
==
0
:
self
.
unique_id
=
ncclGetUniqueId
()
self
.
unique_id
=
ncclGetUniqueId
()
else
:
else
:
self
.
unique_id
=
NcclUniqueId
()
self
.
unique_id
=
NcclUniqueId
()
tensor
=
torch
.
ByteTensor
(
list
(
tensor
=
torch
.
ByteTensor
(
list
(
self
.
unique_id
.
internal
)).
cuda
(
self
.
unique_id
.
internal
)).
cuda
(
local_rank
)
self
.
local_rank
)
dist
.
broadcast
(
tensor
,
src
=
0
)
dist
.
broadcast
(
tensor
,
src
=
0
)
byte_list
=
tensor
.
cpu
().
tolist
()
byte_list
=
tensor
.
cpu
().
tolist
()
for
i
,
byte
in
enumerate
(
byte_list
):
for
i
,
byte
in
enumerate
(
byte_list
):
self
.
unique_id
.
internal
[
i
]
=
byte
self
.
unique_id
.
internal
[
i
]
=
byte
self
.
comm
=
ctypes
.
c_void_p
()
self
.
comm
=
ctypes
.
c_void_p
()
result
=
_c_ncclCommInitRank
(
ctypes
.
byref
(
self
.
comm
),
world_size
,
result
=
_c_ncclCommInitRank
(
ctypes
.
byref
(
self
.
comm
),
self
.
world_size
,
self
.
unique_id
,
rank
)
self
.
unique_id
,
self
.
rank
)
assert
result
==
0
assert
result
==
0
self
.
stream
=
torch
.
cuda
.
Stream
(
device
=
f
"cuda:
{
local_rank
}
"
)
self
.
stream
=
torch
.
cuda
.
Stream
(
device
=
f
"cuda:
{
self
.
local_rank
}
"
)
def
all_reduce
(
self
,
def
all_reduce
(
self
,
tensor
:
torch
.
Tensor
,
tensor
:
torch
.
Tensor
,
...
@@ -271,4 +274,6 @@ class NCCLCommunicator:
...
@@ -271,4 +274,6 @@ class NCCLCommunicator:
# `dist` module might have been already destroyed
# `dist` module might have been already destroyed
if
hasattr
(
dist
,
'destroy_process_group'
):
if
hasattr
(
dist
,
'destroy_process_group'
):
dist
.
destroy_process_group
()
dist
.
destroy_process_group
()
# function might have been already destroyed
if
_c_ncclCommDestroy
is
not
None
:
_c_ncclCommDestroy
(
self
.
comm
)
_c_ncclCommDestroy
(
self
.
comm
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment