[Model Runner V2] Use pinned memory for write_contents (#34222)

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>

[Model Runner V2] Use pinned memory for write_contents (#34222)
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
a2443de5 · Woosuk Kwon · GitHub · f84a2a8f · a2443de5
Unverified commit a2443de5, authored Feb 10, 2026 by Woosuk Kwon; committed by GitHub on Feb 10, 2026.
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 20 deletions

vllm/v1/worker/gpu/buffer_utils.py vllm/v1/worker/gpu/buffer_utils.py +9 -20

No files found.
--- a/vllm/v1/worker/gpu/buffer_utils.py
+++ b/vllm/v1/worker/gpu/buffer_utils.py
@@ -7,9 +7,11 @@ import numpy as np
 import torch
 from vllm.triton_utils import tl, triton
-from vllm.utils.math_utils import next_power_of_2
 from vllm.utils.platform_utils import is_uva_available
-from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor
+from vllm.utils.torch_utils import (
+    async_tensor_h2d,
+    get_accelerator_view_from_cpu_tensor,
+)
 def async_copy_to_gpu(
@@ -117,6 +119,7 @@ class StagedWriteTensor:
            )
        self.num_rows = size if isinstance(size, int) else size[0]
        self.dtype = dtype
+        self.device = device
        self.max_concurrency = max_concurrency
        if not uva_instead_of_gpu:
@@ -137,8 +140,6 @@ class StagedWriteTensor:
        self.write_indices = new_buffer(self.num_rows, dtype=torch.int32)
        self.write_starts = new_buffer(self.num_rows, dtype=torch.int32)
-        init_size = next_power_of_2(self.num_rows)
-        self.write_contents = new_buffer(init_size, dtype=dtype)
        self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32)
    def stage_write(
@@ -170,21 +171,9 @@ class StagedWriteTensor:
        cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)
        # Special handling for write_contents
-        diff_len = len(self._staged_write_contents)
-        assert isinstance(self.write_contents.size, int)
-        if diff_len > self.write_contents.size:
-            # Re-allocate a larger buffer for the write_contents
-            new_size = next_power_of_2(diff_len)
-            self.write_contents = UvaBufferPool(
-                new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
+        write_contents = async_tensor_h2d(
+            self._staged_write_contents, self.dtype, self.device, pin_memory=True
        )
-            # NOTE(woosuk): Since the previous write_contents buffer is released,
-            # we perform a synchronization here to ensure that all data transfers
-            # involving the old buffer have finished before allocating a new one.
-            # This prevents potential race conditions. The slight overhead is
-            # negligible because the reallocations are infrequent in practice.
-            torch.cuda.synchronize()
-        contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
        # Write diffs to the GPU buffer
        _apply_write_kernel[(n,)](
@@ -192,7 +181,7 @@ class StagedWriteTensor:
            self.gpu.stride(0),
            indices_uva,
            starts_uva,
-            contents_uva,
+            write_contents,
            cu_lens_uva,
            BLOCK_SIZE=1024,
        )