[V1] Remove pin_memory() in async_copy_to_gpu to fix sporadic stalls (#37006)

Signed-off-by: Sebastien Beurnier <sbeurnier@together.ai>

[V1] Remove pin_memory() in async_copy_to_gpu to fix sporadic stalls (#37006)
Signed-off-by: Sebastien Beurnier <sbeurnier@together.ai>
a116f969 · sbeurnier · GitHub · 092ace9e · a116f969
Unverified Commit a116f969 authored Mar 14, 2026 by sbeurnier Committed by GitHub Mar 14, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 6 deletions

vllm/v1/worker/gpu/buffer_utils.py vllm/v1/worker/gpu/buffer_utils.py +4 -6

No files found.
--- a/vllm/v1/worker/gpu/buffer_utils.py
+++ b/vllm/v1/worker/gpu/buffer_utils.py
@@ -27,12 +27,10 @@ def async_copy_to_gpu(
        assert device is not None
        out = torch.empty_like(x, device=device)

-    # CPU-to-CPU copy
-    tmp = x.pin_memory()
-    assert tmp is not x
-
-    # CPU-to-GPU copy
-    return out.copy_(tmp, non_blocking=True)
+    # Copy x directly to the GPU: the explicit pin_memory() staging copy caused
+    # sporadic stalls under high concurrency (attributed to CUDA driver
+    # contention). If x is not pinned, non_blocking=True may still block the host.
+    return out.copy_(x, non_blocking=True)


 class UvaBuffer: