[Bugfix] Runtime driver check for cuMemcpyBatchAsync in swap_blocks_batch (#38919)

Signed-off-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>

[Bugfix] Runtime driver check for cuMemcpyBatchAsync in swap_blocks_batch (#38919)
Signed-off-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Itay Etelis <itay.etelis@ibm.com> Co-authored-by: Michael Goin <mgoin64@gmail.com>
bd8bd523 · Itay Etelis · GitHub · 59b2f7b6 · bd8bd523
Unverified Commit bd8bd523 authored Apr 11, 2026 by Itay Etelis Committed by GitHub Apr 11, 2026
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 30 deletions

csrc/cache_kernels.cu csrc/cache_kernels.cu +42 -30

No files found.
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -104,29 +104,41 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
  static_assert(sizeof(size_t) == sizeof(int64_t));
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
+  // binaries compiled with CUDA 12.8+ still work on older drivers, and
+  // we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
+  // The function pointer is cached after the first call.
+  using BatchFn =
+      CUresult (*)(CUdeviceptr*, CUdeviceptr*, size_t*, size_t,
+                   CUmemcpyAttributes*, size_t*, size_t, size_t*, CUstream);
+  static BatchFn batch_fn = []() -> BatchFn {
+    CUdriverProcAddressQueryResult sym_status;
+    void* fn_ptr = nullptr;
+    CUresult res = cuGetProcAddress("cuMemcpyBatchAsync", &fn_ptr, 12080,
+                                    CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
+    if (res != CUDA_SUCCESS || fn_ptr == nullptr) {
+      return nullptr;
+    }
+    return reinterpret_cast<BatchFn>(fn_ptr);
+  }();
+
+  if (batch_fn != nullptr) {
    CUmemcpyAttributes attr = {};
    attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
    size_t attrs_idx = 0;
-  #if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
-  CUresult result = cuMemcpyBatchAsync(
-      reinterpret_cast<CUdeviceptr*>(dst_data),
-      reinterpret_cast<CUdeviceptr*>(src_data),
-      reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
-      &attrs_idx, 1, static_cast<CUstream>(stream));
-  TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed with error ",
-              result);
-  #else
    size_t fail_idx = 0;
-  CUresult result = cuMemcpyBatchAsync(
-      reinterpret_cast<CUdeviceptr*>(dst_data),
+    CUresult result = batch_fn(reinterpret_cast<CUdeviceptr*>(dst_data),
                               reinterpret_cast<CUdeviceptr*>(src_data),
-      reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
-      &attrs_idx, 1, &fail_idx, static_cast<CUstream>(stream));
+                               reinterpret_cast<size_t*>(size_data),
+                               static_cast<size_t>(n), &attr, &attrs_idx, 1,
+                               &fail_idx, static_cast<CUstream>(stream));
    TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
                fail_idx, " with error ", result);
-  #endif
-#else
-  // Fallback for CUDA < 12.8 and ROCm: individual async copies.
+  } else
+#endif
+  {
+    // Fallback for CUDA < 12.8, older drivers, and ROCm:
+    // individual async copies.
    // cudaMemcpyDefault lets the driver infer direction from pointer types.
    for (int64_t i = 0; i < n; i++) {
      cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
@@ -134,7 +146,7 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
                      static_cast<size_t>(size_data[i]), cudaMemcpyDefault,
                      stream);
    }
-#endif
+  }
 }

 namespace vllm {