Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
efeaac92
Unverified
Commit
efeaac92
authored
Jan 06, 2026
by
tianshu-Michael-yu
Committed by
GitHub
Jan 07, 2026
Browse files
[Bugfix] Fix race condition in async-scheduling for vlm model (#31841)
Signed-off-by:
Tianshu Yu
<
tianshuyu.formal@gmail.com
>
parent
55caa605
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
3 deletions
+14
-3
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+14
-3
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
efeaac92
...
...
@@ -554,7 +554,13 @@ class GPUModelRunner(
# Only relevant for multimodal models
if
self
.
supports_mm_inputs
:
self
.
is_mm_embed
=
self
.
_make_buffer
(
self
.
max_num_tokens
,
dtype
=
torch
.
bool
)
# Double buffer to avoid race condition: previous iteration's async
# copy may still be reading from CPU while current iteration writes.
self
.
is_mm_embed_buffers
=
[
self
.
_make_buffer
(
self
.
max_num_tokens
,
dtype
=
torch
.
bool
),
self
.
_make_buffer
(
self
.
max_num_tokens
,
dtype
=
torch
.
bool
),
]
self
.
is_mm_embed_idx
=
0
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
if
self
.
uses_mrope
:
...
...
@@ -2337,8 +2343,13 @@ class GPUModelRunner(
)
->
tuple
[
list
[
torch
.
Tensor
],
torch
.
Tensor
]:
total_num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
# Swap to the other buffer to avoid race condition with previous
# iteration's async copy that may still be reading from CPU.
self
.
is_mm_embed_idx
=
1
-
self
.
is_mm_embed_idx
is_mm_embed_buf
=
self
.
is_mm_embed_buffers
[
self
.
is_mm_embed_idx
]
mm_embeds
=
list
[
torch
.
Tensor
]()
is_mm_embed
=
self
.
is_mm_embed
.
cpu
is_mm_embed
=
is_mm_embed
_buf
.
cpu
is_mm_embed
[:
total_num_scheduled_tokens
]
=
False
req_start_idx
=
0
...
...
@@ -2416,7 +2427,7 @@ class GPUModelRunner(
mm_embeds
.
extend
(
mm_embeds_req
)
req_start_idx
+=
num_scheduled_tokens
is_mm_embed
=
self
.
is_mm_embed
.
copy_to_gpu
(
total_num_scheduled_tokens
)
is_mm_embed
=
is_mm_embed
_buf
.
copy_to_gpu
(
total_num_scheduled_tokens
)
if
should_sync_mrope_positions
:
self
.
_calc_mrope_positions
(
scheduler_output
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment