Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
aafb99a4
Unverified
Commit
aafb99a4
authored
Oct 09, 2025
by
Nick Hill
Committed by
GitHub
Oct 10, 2025
Browse files
[Core] Small simplification in `GPUModelRunner._update_states()` (#26508)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
757fa4a4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
2 additions
and
7 deletions
+2
-7
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-7
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
aafb99a4
...
@@ -708,6 +708,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -708,6 +708,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Update the cached states.
# Update the cached states.
req_state
.
num_computed_tokens
=
num_computed_tokens
req_state
.
num_computed_tokens
=
num_computed_tokens
req_index
=
self
.
input_batch
.
req_id_to_index
.
get
(
req_id
)
if
not
is_last_rank
:
if
not
is_last_rank
:
# When using PP, the scheduler sends the sampled tokens back,
# When using PP, the scheduler sends the sampled tokens back,
...
@@ -728,19 +729,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -728,19 +729,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Some output tokens were discarded due to a sync-KV-load
# Some output tokens were discarded due to a sync-KV-load
# failure. Align the cached state.
# failure. Align the cached state.
del
req_state
.
output_token_ids
[
num_output_tokens
:]
del
req_state
.
output_token_ids
[
num_output_tokens
:]
req_index
=
self
.
input_batch
.
req_id_to_index
.
get
(
req_id
)
if
req_index
is
not
None
:
if
req_index
is
not
None
:
old_end_idx
=
self
.
input_batch
.
num_tokens_no_spec
[
req_index
]
end_idx
=
(
end_idx
=
(
self
.
input_batch
.
num_prompt_tokens
[
req_index
]
self
.
input_batch
.
num_prompt_tokens
[
req_index
]
+
num_output_tokens
+
num_output_tokens
)
)
self
.
input_batch
.
num_tokens
[
req_index
]
=
end_idx
self
.
input_batch
.
num_tokens
[
req_index
]
=
end_idx
self
.
input_batch
.
num_tokens_no_spec
[
req_index
]
=
end_idx
self
.
input_batch
.
num_tokens_no_spec
[
req_index
]
=
end_idx
self
.
input_batch
.
is_token_ids
[
req_index
,
end_idx
:
old_end_idx
]
=
(
False
)
# Update the block IDs.
# Update the block IDs.
if
not
resumed_from_preemption
:
if
not
resumed_from_preemption
:
...
@@ -749,12 +744,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -749,12 +744,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
for
block_ids
,
new_ids
in
zip
(
req_state
.
block_ids
,
new_block_ids
):
for
block_ids
,
new_ids
in
zip
(
req_state
.
block_ids
,
new_block_ids
):
block_ids
.
extend
(
new_ids
)
block_ids
.
extend
(
new_ids
)
else
:
else
:
assert
req_index
is
None
assert
new_block_ids
is
not
None
assert
new_block_ids
is
not
None
# The request is resumed from preemption.
# The request is resumed from preemption.
# Replace the existing block IDs with the new ones.
# Replace the existing block IDs with the new ones.
req_state
.
block_ids
=
new_block_ids
req_state
.
block_ids
=
new_block_ids
req_index
=
self
.
input_batch
.
req_id_to_index
.
get
(
req_id
)
if
req_index
is
None
:
if
req_index
is
None
:
# The request is not in the persistent batch.
# The request is not in the persistent batch.
# The request was either preempted and resumed later, or was not
# The request was either preempted and resumed later, or was not
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment