Commit c8a63b38 authored by xiabo
Browse files

add custom allreduce check

parent a8c92908
......@@ -132,12 +132,10 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported
# this checks hardware and driver support for NVLink
# xiabo
# assert current_platform.is_cuda()
# from vllm.platforms.cuda import CudaPlatform
# cuda_platform: CudaPlatform = current_platform
# full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
full_nvlink = True
assert current_platform.is_cuda_alike()
full_nvlink = current_platform.is_fully_connected_nvlink_or_xgmi(
physical_device_ids)
if world_size > 2 and not full_nvlink:
logger.warning(
"Custom allreduce is disabled because it's not supported on"
......
......@@ -18,6 +18,10 @@ else:
logger = init_logger(__name__)
from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
amdsmi_get_processor_handles, amdsmi_init,
amdsmi_shut_down, amdsmi_topo_get_link_type)
try:
import vllm._C # noqa: F401
except ImportError as e:
......@@ -104,6 +108,30 @@ class RocmPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id)
@staticmethod
def is_fully_connected_nvlink_or_xgmi(
        physical_device_ids: List[int]) -> bool:
    """Return whether every pair of the given GPUs is linked by 1-hop XGMI.

    Args:
        physical_device_ids: Indices into the handle list returned by
            ``amdsmi_get_processor_handles()``.

    Returns:
        ``True`` only if every unordered pair of the selected devices
        reports ``hops == 1`` and ``type == 2`` (XGMI). ``False`` if any
        pair fails that check, or if amdsmi raises while being
        initialized or queried (the failure is logged).
    """
    # Value reported in link_type["type"] for an XGMI link.
    xgmi_link_type = 2

    # amdsmi has to be initialized before any query is issued; treat a
    # failed init the same way as a failed topology query.
    try:
        amdsmi_init()
    except AmdSmiException as error:
        logger.error("AMD 1 hop XGMI detection failed.", exc_info=error)
        return False

    try:
        # Enumerate the processors once instead of once per device id.
        all_handles = amdsmi_get_processor_handles()
        handles = [all_handles[i] for i in physical_device_ids]
        for idx, handle in enumerate(handles):
            # Only visit each unordered pair once (idx < peer index).
            for peer_handle in handles[idx + 1:]:
                link_type = amdsmi_topo_get_link_type(handle, peer_handle)
                if (link_type["hops"] != 1
                        or link_type["type"] != xgmi_link_type):
                    return False
    except AmdSmiException as error:
        logger.error("AMD 1 hop XGMI detection failed.", exc_info=error)
        return False
    finally:
        # Always pair the successful init above with a shutdown.
        amdsmi_shut_down()
    return True
@classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment