Unverified Commit a2443de5 authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

[Model Runner V2] Use pinned memory for write_contents (#34222)


Signed-off-by: default avatarWoosuk Kwon <woosuk@inferact.ai>
parent f84a2a8f
...@@ -7,9 +7,11 @@ import numpy as np ...@@ -7,9 +7,11 @@ import numpy as np
import torch import torch
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils.math_utils import next_power_of_2
from vllm.utils.platform_utils import is_uva_available from vllm.utils.platform_utils import is_uva_available
from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor from vllm.utils.torch_utils import (
async_tensor_h2d,
get_accelerator_view_from_cpu_tensor,
)
def async_copy_to_gpu( def async_copy_to_gpu(
...@@ -117,6 +119,7 @@ class StagedWriteTensor: ...@@ -117,6 +119,7 @@ class StagedWriteTensor:
) )
self.num_rows = size if isinstance(size, int) else size[0] self.num_rows = size if isinstance(size, int) else size[0]
self.dtype = dtype self.dtype = dtype
self.device = device
self.max_concurrency = max_concurrency self.max_concurrency = max_concurrency
if not uva_instead_of_gpu: if not uva_instead_of_gpu:
...@@ -137,8 +140,6 @@ class StagedWriteTensor: ...@@ -137,8 +140,6 @@ class StagedWriteTensor:
self.write_indices = new_buffer(self.num_rows, dtype=torch.int32) self.write_indices = new_buffer(self.num_rows, dtype=torch.int32)
self.write_starts = new_buffer(self.num_rows, dtype=torch.int32) self.write_starts = new_buffer(self.num_rows, dtype=torch.int32)
init_size = next_power_of_2(self.num_rows)
self.write_contents = new_buffer(init_size, dtype=dtype)
self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32) self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32)
def stage_write( def stage_write(
...@@ -170,21 +171,9 @@ class StagedWriteTensor: ...@@ -170,21 +171,9 @@ class StagedWriteTensor:
cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens) cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens)
# Special handling for write_contents # Special handling for write_contents
diff_len = len(self._staged_write_contents) write_contents = async_tensor_h2d(
assert isinstance(self.write_contents.size, int) self._staged_write_contents, self.dtype, self.device, pin_memory=True
if diff_len > self.write_contents.size: )
# Re-allocate a larger buffer for the write_contents
new_size = next_power_of_2(diff_len)
self.write_contents = UvaBufferPool(
new_size, dtype=self.dtype, max_concurrency=self.max_concurrency
)
# NOTE(woosuk): Since the previous write_contents buffer is released,
# we perform a synchronization here to ensure that all data transfers
# involving the old buffer have finished before allocating a new one.
# This prevents potential race conditions. The slight overhead is
# negligible because the reallocations are infrequent in practice.
torch.cuda.synchronize()
contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents)
# Write diffs to the GPU buffer # Write diffs to the GPU buffer
_apply_write_kernel[(n,)]( _apply_write_kernel[(n,)](
...@@ -192,7 +181,7 @@ class StagedWriteTensor: ...@@ -192,7 +181,7 @@ class StagedWriteTensor:
self.gpu.stride(0), self.gpu.stride(0),
indices_uva, indices_uva,
starts_uva, starts_uva,
contents_uva, write_contents,
cu_lens_uva, cu_lens_uva,
BLOCK_SIZE=1024, BLOCK_SIZE=1024,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment