"tests/distributed/test_node_count.py" did not exist on "c4bd03c7c5672b6a5d3d6839339853e04fe15127"
Unverified Commit bd8bd523 authored by Itay Etelis's avatar Itay Etelis Committed by GitHub
Browse files

[Bugfix] Runtime driver check for cuMemcpyBatchAsync in swap_blocks_batch (#38919)


Signed-off-by: default avatarItay Etelis <itay.etelis@ibm.com>
Co-authored-by: default avatarItay Etelis <itay.etelis@ibm.com>
Co-authored-by: default avatarMichael Goin <mgoin64@gmail.com>
parent 59b2f7b6
......@@ -104,29 +104,41 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
static_assert(sizeof(size_t) == sizeof(int64_t));
#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
// Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
// binaries compiled with CUDA 12.8+ still work on older drivers, and
// we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
// The function pointer is cached after the first call.
using BatchFn =
CUresult (*)(CUdeviceptr*, CUdeviceptr*, size_t*, size_t,
CUmemcpyAttributes*, size_t*, size_t, size_t*, CUstream);
static BatchFn batch_fn = []() -> BatchFn {
CUdriverProcAddressQueryResult sym_status;
void* fn_ptr = nullptr;
CUresult res = cuGetProcAddress("cuMemcpyBatchAsync", &fn_ptr, 12080,
CU_GET_PROC_ADDRESS_DEFAULT, &sym_status);
if (res != CUDA_SUCCESS || fn_ptr == nullptr) {
return nullptr;
}
return reinterpret_cast<BatchFn>(fn_ptr);
}();
if (batch_fn != nullptr) {
CUmemcpyAttributes attr = {};
attr.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
size_t attrs_idx = 0;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000
CUresult result = cuMemcpyBatchAsync(
reinterpret_cast<CUdeviceptr*>(dst_data),
reinterpret_cast<CUdeviceptr*>(src_data),
reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
&attrs_idx, 1, static_cast<CUstream>(stream));
TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed with error ",
result);
#else
size_t fail_idx = 0;
CUresult result = cuMemcpyBatchAsync(
reinterpret_cast<CUdeviceptr*>(dst_data),
CUresult result = batch_fn(reinterpret_cast<CUdeviceptr*>(dst_data),
reinterpret_cast<CUdeviceptr*>(src_data),
reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
&attrs_idx, 1, &fail_idx, static_cast<CUstream>(stream));
reinterpret_cast<size_t*>(size_data),
static_cast<size_t>(n), &attr, &attrs_idx, 1,
&fail_idx, static_cast<CUstream>(stream));
TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
fail_idx, " with error ", result);
#endif
#else
// Fallback for CUDA < 12.8 and ROCm: individual async copies.
} else
#endif
{
// Fallback for CUDA < 12.8, older drivers, and ROCm:
// individual async copies.
// cudaMemcpyDefault lets the driver infer direction from pointer types.
for (int64_t i = 0; i < n; i++) {
cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
......@@ -134,7 +146,7 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
static_cast<size_t>(size_data[i]), cudaMemcpyDefault,
stream);
}
#endif
}
}
namespace vllm {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment