Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
40b2f1c3
Unverified
Commit
40b2f1c3
authored
Feb 19, 2026
by
Nick Hill
Committed by
GitHub
Feb 19, 2026
Browse files
[Model Runner V2] Minor CPU optimizations (#34856)
Signed-off-by:
Nick Hill
<
nickhill123@gmail.com
>
parent
648951a9
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
28 additions
and
10 deletions
+28
-10
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/shm_broadcast.py
+2
-2
vllm/v1/worker/gpu/async_utils.py
vllm/v1/worker/gpu/async_utils.py
+16
-3
vllm/v1/worker/gpu/buffer_utils.py
vllm/v1/worker/gpu/buffer_utils.py
+3
-5
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+7
-0
No files found.
vllm/distributed/device_communicators/shm_broadcast.py
View file @
40b2f1c3
...
...
@@ -513,8 +513,8 @@ class MessageQueue:
assert
self
.
_is_local_reader
,
"Only readers can acquire read"
start_time
=
time
.
monotonic
()
n_warning
=
1
w
hile
True
:
w
ith
self
.
buffer
.
get_metadata
(
self
.
current_idx
)
as
metadata_buffer
:
w
ith
self
.
buffer
.
get_metadata
(
self
.
current_idx
)
as
metadata_buffer
:
w
hile
True
:
# Memory fence ensures we see the latest writes from the writer.
# Without this, we may read stale flags from our CPU cache
# and spin indefinitely even though writer has updated them.
...
...
vllm/v1/worker/gpu/async_utils.py
View file @
40b2f1c3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
numpy
as
np
import
torch
...
...
@@ -14,6 +15,7 @@ class AsyncOutput(AsyncModelRunnerOutput):
model_runner_output
:
ModelRunnerOutput
,
sampler_output
:
SamplerOutput
,
num_sampled_tokens
:
torch
.
Tensor
,
main_stream
:
torch
.
cuda
.
Stream
,
copy_stream
:
torch
.
cuda
.
Stream
,
copy_event
:
torch
.
cuda
.
Event
,
):
...
...
@@ -25,9 +27,8 @@ class AsyncOutput(AsyncModelRunnerOutput):
self
.
num_sampled_tokens
=
num_sampled_tokens
self
.
copy_event
=
copy_event
default_stream
=
torch
.
cuda
.
current_stream
()
with
torch
.
cuda
.
stream
(
copy_stream
):
copy_stream
.
wait_stream
(
default_stream
)
with
stream
(
copy_stream
,
main_stream
):
copy_stream
.
wait_stream
(
main_stream
)
self
.
sampled_token_ids
=
async_copy_to_np
(
sampler_output
.
sampled_token_ids
)
self
.
logprobs_tensors
:
LogprobsTensors
|
None
=
None
...
...
@@ -71,3 +72,15 @@ class AsyncOutput(AsyncModelRunnerOutput):
def
async_copy_to_np
(
x
:
torch
.
Tensor
)
->
np
.
ndarray
:
return
x
.
to
(
"cpu"
,
non_blocking
=
True
).
numpy
()
@
contextlib
.
contextmanager
def
stream
(
to_stream
:
torch
.
cuda
.
Stream
,
from_stream
:
torch
.
cuda
.
Stream
):
"""Lightweight version of torch.cuda.stream() context manager which
avoids current_stream and device lookups.
"""
try
:
torch
.
cuda
.
set_stream
(
to_stream
)
yield
finally
:
torch
.
cuda
.
set_stream
(
from_stream
)
vllm/v1/worker/gpu/buffer_utils.py
View file @
40b2f1c3
...
...
@@ -22,7 +22,6 @@ def async_copy_to_gpu(
if
isinstance
(
x
,
np
.
ndarray
):
x
=
torch
.
from_numpy
(
x
)
assert
x
.
is_cpu
assert
not
x
.
is_pinned
()
if
out
is
None
:
assert
device
is
not
None
...
...
@@ -30,6 +29,8 @@ def async_copy_to_gpu(
# CPU-to-CPU copy
tmp
=
x
.
pin_memory
()
assert
tmp
is
not
x
# CPU-to-GPU copy
return
out
.
copy_
(
tmp
,
non_blocking
=
True
)
...
...
@@ -75,11 +76,8 @@ class UvaBufferPool:
out
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
uva
=
self
.
copy_to_uva
(
x
)
if
out
is
None
:
# CPU-to-GPU copy
return
uva
.
clone
()
# CPU-to-GPU copy
return
out
.
copy_
(
uva
,
non_blocking
=
True
)
return
uva
.
clone
()
if
out
is
None
else
out
.
copy_
(
uva
,
non_blocking
=
True
)
class
UvaBackedTensor
:
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
40b2f1c3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
functools
import
gc
import
time
from
copy
import
deepcopy
...
...
@@ -239,6 +240,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def
get_model
(
self
)
->
nn
.
Module
:
return
self
.
model
@
functools
.
cached_property
def
main_stream
(
self
)
->
torch
.
cuda
.
Stream
:
# Cache the default CUDA stream to avoid lookup overhead.
return
torch
.
cuda
.
current_stream
(
self
.
device
)
def
get_kv_cache_spec
(
self
):
return
get_kv_cache_spec
(
self
.
vllm_config
)
...
...
@@ -1065,6 +1071,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
model_runner_output
=
model_runner_output
,
sampler_output
=
sampler_output
,
num_sampled_tokens
=
num_sampled
,
main_stream
=
self
.
main_stream
,
copy_stream
=
self
.
output_copy_stream
,
copy_event
=
self
.
output_copy_event
,
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment