Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c8a63b38
Commit
c8a63b38
authored
Mar 25, 2025
by
xiabo
Browse files
add custom allreduce check
parent
a8c92908
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
32 additions
and
6 deletions
+32
-6
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+4
-6
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+28
-0
No files found.
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
c8a63b38
...
...
@@ -132,12 +132,10 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
# xiabo
# assert current_platform.is_cuda()
# from vllm.platforms.cuda import CudaPlatform
# cuda_platform: CudaPlatform = current_platform
# full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
full_nvlink
=
True
assert
current_platform
.
is_cuda_alike
()
full_nvlink
=
current_platform
.
is_fully_connected_nvlink_or_xgmi
(
physical_device_ids
)
if
world_size
>
2
and
not
full_nvlink
:
logger
.
warning
(
"Custom allreduce is disabled because it's not supported on"
...
...
vllm/platforms/rocm.py
View file @
c8a63b38
...
...
@@ -18,6 +18,10 @@ else:
logger
=
init_logger
(
__name__
)
from
amdsmi
import
(
AmdSmiException
,
amdsmi_get_gpu_asic_info
,
amdsmi_get_processor_handles
,
amdsmi_init
,
amdsmi_shut_down
,
amdsmi_topo_get_link_type
)
try
:
import
vllm._C
# noqa: F401
except
ImportError
as
e
:
...
...
@@ -104,6 +108,30 @@ class RocmPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str:
    """Return the name ``torch.cuda`` reports for device ``device_id``."""
    device_name = torch.cuda.get_device_name(device_id)
    return device_name
@staticmethod
def is_fully_connected_nvlink_or_xgmi(
        physical_device_ids: List[int]) -> bool:
    """
    Query if the set of gpus are fully connected by xgmi (1 hop)

    Args:
        physical_device_ids: indices into the list returned by
            ``amdsmi_get_processor_handles()``.

    Returns:
        True when every unordered pair of the selected devices reports a
        link with ``hops == 1`` and ``type == 2``. False as soon as one
        pair does not, or when querying a pair raises
        ``AmdSmiException`` (the error is logged). With fewer than two
        devices there are no pairs to check, so the result is True.
    """
    # NOTE(review): nothing in this function initialises amdsmi;
    # presumably the caller (or module setup) has done so -- confirm.
    if not physical_device_ids:
        # No devices selected: nothing to query, and no pair can fail.
        return True

    # Fetch the handle list once instead of once per requested device id.
    all_handles = amdsmi_get_processor_handles()
    handles = [all_handles[i] for i in physical_device_ids]

    for i, handle in enumerate(handles):
        # Only look at peers after position i, so each unordered pair is
        # checked exactly once and a device is never paired with itself.
        for peer_handle in handles[i + 1:]:
            try:
                link_type = amdsmi_topo_get_link_type(handle, peer_handle)
            except AmdSmiException as error:
                logger.error("AMD 1 hop XGMI detection failed.",
                             exc_info=error)
                return False
            # type is 2 for XGMI
            if link_type["hops"] != 1 or link_type["type"] != 2:
                return False
    return True
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
device_props
=
torch
.
cuda
.
get_device_properties
(
device_id
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment