Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a2443de5
Unverified
Commit
a2443de5
authored
Feb 10, 2026
by
Woosuk Kwon
Committed by
GitHub
Feb 10, 2026
Browse files
[Model Runner V2] Use pinned memory for write_contents (#34222)
Signed-off-by:
Woosuk Kwon
<
woosuk@inferact.ai
>
parent
f84a2a8f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
20 deletions
+9
-20
vllm/v1/worker/gpu/buffer_utils.py
vllm/v1/worker/gpu/buffer_utils.py
+9
-20
No files found.
vllm/v1/worker/gpu/buffer_utils.py
View file @
a2443de5
...
@@ -7,9 +7,11 @@ import numpy as np
...
@@ -7,9 +7,11 @@ import numpy as np
import
torch
import
torch
from
vllm.triton_utils
import
tl
,
triton
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.math_utils
import
next_power_of_2
from
vllm.utils.platform_utils
import
is_uva_available
from
vllm.utils.platform_utils
import
is_uva_available
from
vllm.utils.torch_utils
import
get_accelerator_view_from_cpu_tensor
from
vllm.utils.torch_utils
import
(
async_tensor_h2d
,
get_accelerator_view_from_cpu_tensor
,
)
def
async_copy_to_gpu
(
def
async_copy_to_gpu
(
...
@@ -117,6 +119,7 @@ class StagedWriteTensor:
...
@@ -117,6 +119,7 @@ class StagedWriteTensor:
)
)
self
.
num_rows
=
size
if
isinstance
(
size
,
int
)
else
size
[
0
]
self
.
num_rows
=
size
if
isinstance
(
size
,
int
)
else
size
[
0
]
self
.
dtype
=
dtype
self
.
dtype
=
dtype
self
.
device
=
device
self
.
max_concurrency
=
max_concurrency
self
.
max_concurrency
=
max_concurrency
if
not
uva_instead_of_gpu
:
if
not
uva_instead_of_gpu
:
...
@@ -137,8 +140,6 @@ class StagedWriteTensor:
...
@@ -137,8 +140,6 @@ class StagedWriteTensor:
self
.
write_indices
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
self
.
write_indices
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
self
.
write_starts
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
self
.
write_starts
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
init_size
=
next_power_of_2
(
self
.
num_rows
)
self
.
write_contents
=
new_buffer
(
init_size
,
dtype
=
dtype
)
self
.
write_cu_lens
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
self
.
write_cu_lens
=
new_buffer
(
self
.
num_rows
,
dtype
=
torch
.
int32
)
def
stage_write
(
def
stage_write
(
...
@@ -170,21 +171,9 @@ class StagedWriteTensor:
...
@@ -170,21 +171,9 @@ class StagedWriteTensor:
cu_lens_uva
=
self
.
write_cu_lens
.
copy_to_uva
(
self
.
_staged_write_cu_lens
)
cu_lens_uva
=
self
.
write_cu_lens
.
copy_to_uva
(
self
.
_staged_write_cu_lens
)
# Special handling for write_contents
# Special handling for write_contents
diff_len
=
len
(
self
.
_staged_write_contents
)
write_contents
=
async_tensor_h2d
(
assert
isinstance
(
self
.
write_contents
.
size
,
int
)
self
.
_staged_write_contents
,
self
.
dtype
,
self
.
device
,
pin_memory
=
True
if
diff_len
>
self
.
write_contents
.
size
:
# Re-allocate a larger buffer for the write_contents
new_size
=
next_power_of_2
(
diff_len
)
self
.
write_contents
=
UvaBufferPool
(
new_size
,
dtype
=
self
.
dtype
,
max_concurrency
=
self
.
max_concurrency
)
)
# NOTE(woosuk): Since the previous write_contents buffer is released,
# we perform a synchronization here to ensure that all data transfers
# involving the old buffer have finished before allocating a new one.
# This prevents potential race conditions. The slight overhead is
# negligible because the reallocations are infrequent in practice.
torch
.
cuda
.
synchronize
()
contents_uva
=
self
.
write_contents
.
copy_to_uva
(
self
.
_staged_write_contents
)
# Write diffs to the GPU buffer
# Write diffs to the GPU buffer
_apply_write_kernel
[(
n
,)](
_apply_write_kernel
[(
n
,)](
...
@@ -192,7 +181,7 @@ class StagedWriteTensor:
...
@@ -192,7 +181,7 @@ class StagedWriteTensor:
self
.
gpu
.
stride
(
0
),
self
.
gpu
.
stride
(
0
),
indices_uva
,
indices_uva
,
starts_uva
,
starts_uva
,
contents
_uva
,
write_
contents
,
cu_lens_uva
,
cu_lens_uva
,
BLOCK_SIZE
=
1024
,
BLOCK_SIZE
=
1024
,
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment