use custom_all_reduce when bs*hidden_size<=256*4096 (bw)

use custom_all_reduce when bs*hidden_size<=256*4096 (bw)
d59f30d4 · zhuwenwen · fee048ff · d59f30d4
Commit d59f30d4 authored Jun 18, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce.py +2 -2

No files found.
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -54,7 +54,7 @@ class CustomAllreduce:
    def __init__(self,
                 group: ProcessGroup,
                 device: Union[int, str, torch.device],
-                 max_size=8192 * 1024) -> None:
+                 max_size=8192 * 512) -> None:
        """
        Args:
            group: the process group to work on. If None, it will use the
@@ -230,7 +230,7 @@ class CustomAllreduce:
            return False
        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
        # little performance improvement over NCCL.
-        return inp_size < self.max_size
+        return inp_size <= self.max_size

    def all_reduce(self,
                   inp: torch.Tensor,