Merge branch 'v0.9.2-dev-custom' into 'v0.9.2-dev'

解决custom cudagraph模式需要拷贝的问题，这个需要配合dtk进行使用。 See merge request dcutoolkit/deeplearing/vllm!306

Merge branch 'v0.9.2-dev-custom' into 'v0.9.2-dev'
解决custom cudagraph模式需要拷贝的问题，这个需要配合dtk进行使用。 See merge request dcutoolkit/deeplearing/vllm!306
06106338 · zhuwenwen · afdabfbe · 651925e8 · 06106338 · 06106338
Commit 06106338 authored Dec 18, 2025 by zhuwenwen
Showing with 27 additions and 3 deletions

csrc/custom_all_reduce.cuh csrc/custom_all_reduce.cuh +18 -2

vllm/distributed/device_communicators/custom_all_reduce.py vllm/distributed/device_communicators/custom_all_reduce.py +4 -1

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -902,6 +902,7 @@ class CustomAllreduce {
  std::map<IPC_KEY, char*> ipc_handles_;
  uint32_t** dev_curr_hdp_reg;
+  hipEvent_t stopEvent;
  /**
   * Signals are an array of ipc-enabled buffers from all ranks.
   * For each of the buffer, the layout is as follows:
@@ -930,6 +931,7 @@ class CustomAllreduce {
        hipDeviceGetAttribute((int*)&dev_curr_hdp_reg[i], hipDeviceAttributeHdpMemFlushCntl, i);
      }
    }
+    cudaEventCreate(&stopEvent);
  }
  char* open_ipc_handle(const void* ipc_handle) {
@@ -1303,9 +1305,22 @@ class CustomAllreduce {
    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
+// #define KL(ngpus, name)                                                       \
+//   name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+//                                                  rank_, size);
 #define KL(ngpus, name)                                                       \
-  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
+  {                                                                           \
-                                                 rank_, size);
+    void* kernelArgs[] = {                                                    \
+      &ptrs, &sg_, &self_sg_, &output, &rank_, &size                          \
+    };                                                                        \
+    hipExtLaunchKernel(                                                       \
+      (void*)name<T, ngpus>,                                                  \
+      blocks, threads,                                                        \
+      kernelArgs, 0,                                                          \
+      stream, nullptr, stopEvent, 0                                           \
+    );                                                                        \
+  }
 #define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
@@ -1342,6 +1357,7 @@ class CustomAllreduce {
      CUDACHECK(cudaIpcCloseMemHandle(ptr));
    }
    cudaFree(dev_curr_hdp_reg);
+    cudaEventDestroy(stopEvent);
  }
 };

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -264,7 +264,10 @@ class CustomAllreduce:
            return None
        if self._IS_CAPTURING:
            if torch.cuda.is_current_stream_capturing():
-                return self.all_reduce(input, registered=False)
+                if not envs.VLLM_CUSTOM_CACHE:
+                    return self.all_reduce(input, registered=False)
+                else:
+                    return self.all_reduce(input, registered=True)
            else:
                # If warm up, mimic the allocation pattern since custom
                # allreduce is out-of-place.

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -152,6 +152,7 @@ if TYPE_CHECKING:
    VLLM_USE_PA_PRINT_PARAM: bool = False 
    VLLM_SPEC_DECODE_EAGER: bool = False
    VLLM_PCIE_USE_CUSTOM_ALLREDUCE: bool = False
+    VLLM_CUSTOM_CACHE: bool = False
    VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX: int = 16
    VLLM_ENFORCE_EAGER_BS_THRESHOLD: Optional[int] = None
    VLLM_HAS_CONTEXT_DEFAULT: bool = False
@@ -1065,6 +1066,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # flag to control vllm to use optimized kernels
    "VLLM_PCIE_USE_CUSTOM_ALLREDUCE":
    lambda: bool(int(os.environ.get("VLLM_PCIE_USE_CUSTOM_ALLREDUCE", "0"))),
+    # flag to run custom allreduce with registered=True during CUDA graph capture
+    "VLLM_CUSTOM_CACHE":
+    lambda: bool(int(os.environ.get("VLLM_CUSTOM_CACHE", "0"))),
    # flag to control vllm to use optimized kernels
    "VLLM_CUSTOM_ALLREDUCE_SUPPORTED_WORLDSIZE_MAX":