区分pcie和hglink custom allreduce的使用

1b5aa25e · xiabo · c6a9b490 · 1b5aa25e · 1b5aa25e
Commit 1b5aa25e authored Jan 04, 2026 by xiabo
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 4 deletions

vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce.py +6 -3

vllm/envs.py vllm/envs.py +1 -1

No files found.
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -264,10 +264,13 @@ class CustomAllreduce:
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
-                if not envs.VLLM_CUSTOM_CACHE:
-                    return self.all_reduce(input, registered=False)
-                else:
+                if envs.VLLM_CUSTOM_CACHE:
                    return self.all_reduce(input, registered=True)
+                else:
+                    if not self.fully_connected:
+                        return self.all_reduce(input, registered=False)
+                    else:
+                        return self.all_reduce(input, registered=True)
            else:
                # If warm up, mimic the allocation pattern since custom
                # allreduce is out-of-place.

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1080,7 +1080,7 @@ environment_variables: dict[str, Callable[[], Any]] = {

    # flag to control vllm to use optimized kernels
    "VLLM_CUSTOM_CACHE":
-    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "1"))),
+    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
    
    # flag to control vllm to use optimized kernels
    "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":