add custom allreduce check

c8a63b38 · xiabo · a8c92908 · c8a63b38 · c8a63b38
Commit c8a63b38 authored Mar 25, 2025 by xiabo
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 6 deletions

vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce.py +4 -6

vllm/platforms/rocm.py vllm/platforms/rocm.py +28 -0

No files found.
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -132,12 +132,10 @@ class CustomAllreduce:
        # test nvlink first, this will filter out most of the cases
        # where custom allreduce is not supported
        # this checks hardware and driver support for NVLink
-        # xiabo
-        # assert current_platform.is_cuda()
-        # from vllm.platforms.cuda import CudaPlatform
-        # cuda_platform: CudaPlatform = current_platform
-        # full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
-        full_nvlink = True
+
+        assert current_platform.is_cuda_alike()
+        full_nvlink = current_platform.is_fully_connected_nvlink_or_xgmi(
+            physical_device_ids)
        if world_size > 2 and not full_nvlink:
            logger.warning(
                "Custom allreduce is disabled because it's not supported on"

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -18,6 +18,10 @@ else:

 logger = init_logger(__name__)

+from amdsmi import (AmdSmiException, amdsmi_get_gpu_asic_info,
+                        amdsmi_get_processor_handles, amdsmi_init,
+                        amdsmi_shut_down, amdsmi_topo_get_link_type)
+
 try:
    import vllm._C  # noqa: F401
 except ImportError as e:
@@ -104,6 +108,30 @@ class RocmPlatform(Platform):
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.cuda.get_device_name(device_id)

+    @staticmethod
+    def is_fully_connected_nvlink_or_xgmi(
+            physical_device_ids: List[int]) -> bool:
+        """
+        Query if the set of gpus are fully connected by xgmi (1 hop)
+
+        Args:
+            physical_device_ids: indices into the list returned by
+                ``amdsmi_get_processor_handles()`` selecting the GPUs
+                to check.
+
+        Returns:
+            True when every unordered pair of the selected GPUs reports
+            a link with ``hops == 1`` and ``type == 2`` (XGMI). False as
+            soon as one pair does not, or when the link query raises
+            ``AmdSmiException`` (the error is logged). An empty list
+            yields True without querying the library.
+        """
+        # NOTE(review): this method does not call amdsmi_init() /
+        # amdsmi_shut_down() itself - confirm the library is initialized
+        # by the caller or by a decorator before this runs.
+        if not physical_device_ids:
+            # No devices means no pairs to check; skip the library call.
+            return True
+
+        # Enumerate the processors once instead of once per requested id.
+        all_handles = amdsmi_get_processor_handles()
+        handles = [all_handles[i] for i in physical_device_ids]
+
+        for i, handle in enumerate(handles):
+            # Visit each unordered pair exactly once: only later peers.
+            for peer_handle in handles[i + 1:]:
+                try:
+                    link_type = amdsmi_topo_get_link_type(
+                        handle, peer_handle)
+                except AmdSmiException as error:
+                    logger.error("AMD 1 hop XGMI detection failed.",
+                                 exc_info=error)
+                    return False
+                # type is 2 for XGMI
+                if link_type["hops"] != 1 or link_type["type"] != 2:
+                    return False
+        return True
+
+
    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        device_props = torch.cuda.get_device_properties(device_id)