"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "4d2a31ab64dec56d136c1c6ded57eaa65c3af25e"
Commit c8a63b38 authored by xiabo's avatar xiabo
Browse files

add custom allreduce check

parent a8c92908
...@@ -132,12 +132,10 @@ class CustomAllreduce: ...@@ -132,12 +132,10 @@ class CustomAllreduce:
# test nvlink first, this will filter out most of the cases # test nvlink first, this will filter out most of the cases
# where custom allreduce is not supported # where custom allreduce is not supported
# this checks hardware and driver support for NVLink # this checks hardware and driver support for NVLink
# xiabo
# assert current_platform.is_cuda() assert current_platform.is_cuda_alike()
# from vllm.platforms.cuda import CudaPlatform full_nvlink = current_platform.is_fully_connected_nvlink_or_xgmi(
# cuda_platform: CudaPlatform = current_platform physical_device_ids)
# full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
full_nvlink = True
if world_size > 2 and not full_nvlink: if world_size > 2 and not full_nvlink:
logger.warning( logger.warning(
"Custom allreduce is disabled because it's not supported on" "Custom allreduce is disabled because it's not supported on"
......
...@@ -18,6 +18,10 @@ else: ...@@ -18,6 +18,10 @@ else:
logger = init_logger(__name__) logger = init_logger(__name__)
from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
amdsmi_get_processor_handles, amdsmi_init,
amdsmi_shut_down, amdsmi_topo_get_link_type)
try: try:
import vllm._C # noqa: F401 import vllm._C # noqa: F401
except ImportError as e: except ImportError as e:
...@@ -104,6 +108,30 @@ class RocmPlatform(Platform): ...@@ -104,6 +108,30 @@ class RocmPlatform(Platform):
def get_device_name(cls, device_id: int = 0) -> str: def get_device_name(cls, device_id: int = 0) -> str:
return torch.cuda.get_device_name(device_id) return torch.cuda.get_device_name(device_id)
@staticmethod
def is_fully_connected_nvlink_or_xgmi(
physical_device_ids: List[int]) -> bool:
"""
Query if the set of gpus are fully connected by xgmi (1 hop)
"""
handles = [
amdsmi_get_processor_handles()[i] for i in physical_device_ids
]
for i, handle in enumerate(handles):
for j, peer_handle in enumerate(handles):
if i < j:
try:
link_type = amdsmi_topo_get_link_type(
handle, peer_handle)
# type is 2 for XGMI
if link_type["hops"] != 1 or link_type["type"] != 2:
return False
except AmdSmiException as error:
logger.error("AMD 1 hop XGMI detection failed.",
exc_info=error)
return False
return True
@classmethod @classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int: def get_device_total_memory(cls, device_id: int = 0) -> int:
device_props = torch.cuda.get_device_properties(device_id) device_props = torch.cuda.get_device_properties(device_id)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment