[Core] Small simplification in `GPUModelRunner._update_states()` (#26508)

Signed-off-by: Nick Hill <nhill@redhat.com>

[Core] Small simplification in `GPUModelRunner._update_states()` (#26508)
Signed-off-by: Nick Hill <nhill@redhat.com>
aafb99a4 · Nick Hill · GitHub · 757fa4a4 · aafb99a4
Unverified commit aafb99a4, authored Oct 09, 2025 by Nick Hill; committed by GitHub on Oct 10, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 7 deletions

vllm/v1/worker/gpu_model_runner.py +2 -7

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -708,6 +708,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            # Update the cached states.
            req_state.num_computed_tokens = num_computed_tokens
+            req_index = self.input_batch.req_id_to_index.get(req_id)
            if not is_last_rank:
                # When using PP, the scheduler sends the sampled tokens back,
@@ -728,19 +729,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                # Some output tokens were discarded due to a sync-KV-load
                # failure. Align the cached state.
                del req_state.output_token_ids[num_output_tokens:]
-                req_index = self.input_batch.req_id_to_index.get(req_id)
                if req_index is not None:
-                    old_end_idx = self.input_batch.num_tokens_no_spec[req_index]
                    end_idx = (
                        self.input_batch.num_prompt_tokens[req_index]
                        + num_output_tokens
                    )
                    self.input_batch.num_tokens[req_index] = end_idx
                    self.input_batch.num_tokens_no_spec[req_index] = end_idx
-                    self.input_batch.is_token_ids[req_index, end_idx:old_end_idx] = (
-                        False
-                    )
            # Update the block IDs.
            if not resumed_from_preemption:
@@ -749,12 +744,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    for block_ids, new_ids in zip(req_state.block_ids, new_block_ids):
                        block_ids.extend(new_ids)
            else:
+                assert req_index is None
                assert new_block_ids is not None
                # The request is resumed from preemption.
                # Replace the existing block IDs with the new ones.
                req_state.block_ids = new_block_ids
-            req_index = self.input_batch.req_id_to_index.get(req_id)
            if req_index is None:
                # The request is not in the persistent batch.
                # The request was either preempted and resumed later, or was not