解决custom cudagraph模式需要拷贝的问题，需要配合dtk进行使用。

区分pcie和hglink custom allreduce的使用 vllm：export VLLM_CUSTOM_CACHE=1 dtk：export HIP_KERNEL_EVENT_SYSTENFENCE=1

解决custom cudagraph模式需要拷贝的问题，需要配合dtk进行使用。
区分pcie和hglink custom allreduce的使用 vllm：export VLLM_CUSTOM_CACHE=1 dtk：export HIP_KERNEL_EVENT_SYSTENFENCE=1
b93c3473 · zhuwenwen · 58de8cd6 · b93c3473 · b93c3473 · b93c3473
Commit b93c3473 authored Jan 10, 2026 by zhuwenwen
Showing with 30 additions and 3 deletions

csrc/custom_all_reduce.cuh csrc/custom_all_reduce.cuh +18 -2

vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce.py +7 -1

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -490,6 +490,7 @@ class CustomAllreduce {
  std::map<IPC_KEY, char*> ipc_handles_;
  uint32_t** dev_curr_hdp_reg;
+  hipEvent_t stopEvent;
  /**
   * Signals are an array of ipc-enabled buffers from all ranks.
   * For each of the buffer, the layout is as follows:
@@ -518,6 +519,7 @@ class CustomAllreduce {
        hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i);
      }
    }
+    cudaEventCreate(&stopEvent);
  }
  char* open_ipc_handle(const void* ipc_handle) {
@@ -739,9 +741,22 @@ class CustomAllreduce {
      }
    }
+// #define KL(ngpus, name)                                                       \
+//   name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+//                                                  rank_, size);
 #define KL(ngpus, name)                                                       \
-  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+  {                                                                           \
-                                                 rank_, size);
+    void* kernelArgs[] = {                                                    \
+      &ptrs, &sg_, &self_sg_, &output, &rank_, &size                          \
+    };                                                                        \
+    hipExtLaunchKernel(                                                       \
+      (void*)name<T, ngpus>,                                                  \
+      blocks, threads,                                                        \
+      kernelArgs, 0,                                                          \
+      stream, nullptr, stopEvent, 0                                           \
+    );                                                                        \
+  }
 #define REDUCE_CASE(ngpus)                              \
  case ngpus: {                                         \
    if (force_1stage) {                                 \
@@ -784,6 +799,7 @@ class CustomAllreduce {
      CUDACHECK(cudaIpcCloseMemHandle(ptr));
    }
    cudaFree(dev_curr_hdp_reg);
+    cudaEventDestroy(stopEvent);
  }
 };

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -275,7 +275,13 @@ class CustomAllreduce:
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
-                return self.all_reduce(input, registered=False)
+                if envs.VLLM_CUSTOM_CACHE:
+                    return self.all_reduce(input, registered=True)
+                else:
+                    if not self.fully_connected:
+                        return self.all_reduce(input, registered=False)
+                    else:
+                        return self.all_reduce(input, registered=True)
            else:
                # If warm up, mimic the allocation pattern since custom
                # allreduce is out-of-place.

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -215,6 +215,7 @@ if TYPE_CHECKING:
    VLLM_USE_PA_PRINT_PARAM: bool = False 
    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
+    VLLM_CUSTOM_CACHE: bool = False
    VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
    VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
    VLLM_HAS_CONTEXT_DEFAULT: bool = False
@@ -1552,6 +1553,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_PCIE_USE_CUSTOM_ALLREDUCE":
    lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "1"))),
+    # flag to control vllm to use optimized kernels
+    "VLLM_CUSTOM_CACHE":
+    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
    # flag to control vllm to use optimized kernels
    "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":
    lambda: int(os.getenv("VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX", "16")),