Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
14c14327
Unverified
Commit
14c14327
authored
Sep 19, 2025
by
Nick Hill
Committed by
GitHub
Sep 19, 2025
Browse files
[BugFix] Fix async scheduling CPU tensor race take 2 (#25279)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
ee7a66dd
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
22 deletions
+32
-22
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+32
-22
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
14c14327
...
@@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
**
self
.
_init_model_kwargs
(
num_scheduled_tokens
),
**
self
.
_init_model_kwargs
(
num_scheduled_tokens
),
**
self
.
_extract_mm_kwargs
(
scheduler_output
),
**
self
.
_extract_mm_kwargs
(
scheduler_output
),
}
}
elif
(
self
.
enable_prompt_embeds
and
get_pp_group
().
is_first_rank
)
:
elif
self
.
enable_prompt_embeds
and
get_pp_group
().
is_first_rank
:
# Get the input embeddings for the tokens that are not input embeds,
# Get the input embeddings for the tokens that are not input embeds,
# then put them into the appropriate positions.
# then put them into the appropriate positions.
# TODO(qthequartermasterman): Since even when prompt embeds are
# TODO(qthequartermasterman): Since even when prompt embeds are
...
@@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
invalid_req_indices
,
invalid_req_indices
,
)
)
@
contextmanager
def
synchronize_input_prep
(
self
):
if
self
.
prepare_inputs_event
is
None
:
yield
return
# Ensure prior step has finished with reused CPU tensors.
# This is required in the async scheduling case because
# the CPU->GPU transfer happens async.
self
.
prepare_inputs_event
.
synchronize
()
try
:
yield
finally
:
self
.
prepare_inputs_event
.
record
()
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
execute_model
(
def
execute_model
(
self
,
self
,
...
@@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
)
->
Union
[
ModelRunnerOutput
,
AsyncModelRunnerOutput
,
IntermediateTensors
]:
)
->
Union
[
ModelRunnerOutput
,
AsyncModelRunnerOutput
,
IntermediateTensors
]:
with
record_function_or_nullcontext
(
"Preprocess"
):
with
record_function_or_nullcontext
(
"Preprocess"
):
with
self
.
synchronize_input_prep
():
# Update persistent batch states.
self
.
_update_states
(
scheduler_output
)
self
.
_update_states
(
scheduler_output
)
if
not
scheduler_output
.
total_num_scheduled_tokens
:
if
not
scheduler_output
.
total_num_scheduled_tokens
:
if
not
has_kv_transfer_group
():
if
not
has_kv_transfer_group
():
# Return empty ModelRunnerOutput if
there's
no work to do.
# Return empty ModelRunnerOutput if no work to do.
return
EMPTY_MODEL_RUNNER_OUTPUT
return
EMPTY_MODEL_RUNNER_OUTPUT
return
self
.
kv_connector_no_forward
(
scheduler_output
,
return
self
.
kv_connector_no_forward
(
self
.
vllm_config
)
scheduler_output
,
self
.
vllm_config
)
if
self
.
cache_config
.
kv_sharing_fast_prefill
:
if
self
.
cache_config
.
kv_sharing_fast_prefill
:
assert
not
self
.
input_batch
.
num_prompt_logprobs
,
(
assert
not
self
.
input_batch
.
num_prompt_logprobs
,
(
"--kv-sharing-fast-prefill produces incorrect
logprobs for
"
"--kv-sharing-fast-prefill produces incorrect "
"
prompt tokens, tokens, please disable
it when the requests
"
"logprobs for
prompt tokens, tokens, please disable "
"
need prompt logprobs"
)
"it when the requests
need prompt logprobs"
)
if
self
.
prepare_inputs_event
is
not
None
:
# Ensure prior step has finished with reused CPU tensors.
self
.
prepare_inputs_event
.
synchronize
()
try
:
# Prepare the decoder inputs.
# Prepare the decoder inputs.
(
attn_metadata
,
logits_indices
,
spec_decode_metadata
,
(
attn_metadata
,
logits_indices
,
spec_decode_metadata
,
num_scheduled_tokens_np
,
spec_decode_common_attn_metadata
,
num_scheduled_tokens_np
,
spec_decode_common_attn_metadata
,
max_query_len
,
ubatch_slices
,
num_tokens_after_padding
max_query_len
,
ubatch_slices
,
num_tokens_after_padding
)
=
self
.
_prepare_inputs
(
scheduler_output
)
)
=
self
.
_prepare_inputs
(
scheduler_output
)
finally
:
if
self
.
prepare_inputs_event
is
not
None
:
self
.
prepare_inputs_event
.
record
()
(
(
num_scheduled_tokens
,
num_scheduled_tokens
,
num_input_tokens
,
num_input_tokens
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment