fix: update_state，优化性能，去除冗余操作

e9cfa85e · jujl1 · be41974c · e9cfa85e
Commit e9cfa85e authored Jan 17, 2026 by jujl1
Show whitespace changes
Inline Side-by-side

Showing with 5 additions and 8 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +5 -8

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -513,7 +513,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
                # Add the sampled token(s) from the previous step (if any).
                # This doesn't include "unverified" tokens like spec tokens.
                num_new_tokens = len(new_token_ids)
-                if num_new_tokens > 0:
+                if num_new_tokens == 1:
+                    req_state.output_token_ids.append(new_token_ids[-1])
+                elif num_new_tokens > 0:
                    req_state.output_token_ids.extend(
                        new_token_ids)
            if len(spec_token_ids) > 0:
@@ -535,11 +537,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
                # The request is not in the persistent batch.
                # The request was either preempted and resumed later, or was not
                # scheduled in the previous step and needs to be added again.
-                if not is_last_rank:
-                    req_state = self.requests[req_id]
-                    self.input_batch.add_request(req_state)
-                    req_index = self.input_batch.req_id_to_index.get(req_id)
-                else:
                req_ids_to_add.append(req_id)
                continue