Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
16a1cc9b
Unverified
Commit
16a1cc9b
authored
Aug 04, 2024
by
youkaichao
Committed by
GitHub
Aug 04, 2024
Browse files
[misc][distributed] improve libcudart.so finding (#7127)
parent
b1c9aa3d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
25 additions
and
23 deletions
+25
-23
vllm/distributed/device_communicators/cuda_wrapper.py
vllm/distributed/device_communicators/cuda_wrapper.py
+22
-22
vllm/distributed/device_communicators/custom_all_reduce_utils.py
...stributed/device_communicators/custom_all_reduce_utils.py
+3
-1
No files found.
vllm/distributed/device_communicators/cuda_wrapper.py
View file @
16a1cc9b
...
...
@@ -4,9 +4,6 @@ convenient for use when we just need to call a few functions.
"""
import
ctypes
import
glob
import
os
import
sys
from
dataclasses
import
dataclass
from
typing
import
Any
,
Dict
,
List
,
Optional
...
...
@@ -36,24 +33,25 @@ class Function:
argtypes
:
List
[
Any
]
def
get_pytorch_default_cudart_library_path
()
->
str
:
# code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
lib_folder
=
"cuda_runtime"
lib_name
=
"libcudart.so.*[0-9]"
lib_path
=
None
for
path
in
sys
.
path
:
nvidia_path
=
os
.
path
.
join
(
path
,
"nvidia"
)
if
not
os
.
path
.
exists
(
nvidia_path
):
continue
candidate_lib_paths
=
glob
.
glob
(
os
.
path
.
join
(
nvidia_path
,
lib_folder
,
"lib"
,
lib_name
))
if
candidate_lib_paths
and
not
lib_path
:
lib_path
=
candidate_lib_paths
[
0
]
if
lib_path
:
def
find_loaded_library
(
lib_name
)
->
Optional
[
str
]:
"""
According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
the file `/proc/self/maps` contains the memory maps of the process, which includes the
shared libraries loaded by the process. We can use this file to find the path of the
a loaded library.
"""
# noqa
found
=
False
with
open
(
"/proc/self/maps"
)
as
f
:
for
line
in
f
:
if
lib_name
in
line
:
found
=
True
break
if
not
lib_path
:
raise
ValueError
(
f
"
{
lib_name
}
not found in the system path
{
sys
.
path
}
"
)
return
lib_path
if
not
found
:
# the library is not loaded in the current process
return
None
start
=
line
.
index
(
"/"
)
path
=
line
[
start
:].
strip
()
return
path
class
CudaRTLibrary
:
...
...
@@ -100,7 +98,9 @@ class CudaRTLibrary:
def
__init__
(
self
,
so_file
:
Optional
[
str
]
=
None
):
if
so_file
is
None
:
so_file
=
get_pytorch_default_cudart_library_path
()
so_file
=
find_loaded_library
(
"libcudart.so"
)
assert
so_file
is
not
None
,
\
"libcudart.so is not loaded in the current process"
if
so_file
not
in
CudaRTLibrary
.
path_to_library_cache
:
lib
=
ctypes
.
CDLL
(
so_file
)
CudaRTLibrary
.
path_to_library_cache
[
so_file
]
=
lib
...
...
vllm/distributed/device_communicators/custom_all_reduce_utils.py
View file @
16a1cc9b
...
...
@@ -145,6 +145,7 @@ def can_actually_p2p(
p_tgt
.
start
()
p_src
.
join
()
p_tgt
.
join
()
assert
p_src
.
exitcode
==
0
and
p_tgt
.
exitcode
==
0
result
:
List
[
bool
]
=
[]
for
src
,
tgt
in
zip
(
batch_src
,
batch_tgt
):
a
=
result_queue
.
get
()
...
...
@@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
# wrap raised exception to provide more information
raise
RuntimeError
(
f
"Error happened when batch testing "
f
"peer-to-peer access from
{
batch_src
}
to
{
batch_tgt
}
"
)
from
e
f
"peer-to-peer access from
{
batch_src
}
to
{
batch_tgt
}
:
\n
"
f
"
{
returned
.
stderr
.
decode
()
}
"
)
from
e
result
=
pickle
.
loads
(
returned
.
stdout
)
for
_i
,
_j
,
r
in
zip
(
batch_src
,
batch_tgt
,
result
):
cache
[
f
"
{
_i
}
->
{
_j
}
"
]
=
r
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment