fix: 修复 DeepSeek 在 PP>1 且开启 chunked prefill 时的精度问题

4b752578 · jujl1 · 03ca39bd · 4b752578
Commit 4b752578 authored Jan 06, 2026 by jujl1
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +6 -0

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1638,6 +1638,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
        # Mask out the sampled tokens that should not be sampled.
        for i in discard_sampled_tokens_req_indices:
            valid_sampled_token_ids[i].clear()
+        if spec_token_ids is not None:
+            for i in discard_sampled_tokens_req_indices:
+                spec_token_ids[i].clear()

        # Cache the sampled tokens in the model runner, so that the scheduler
        # doesn't need to send them back.
@@ -3519,6 +3522,9 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
        # Mask out the sampled tokens that should not be sampled.
        for i in discard_sampled_tokens_req_indices:
            valid_sampled_token_ids[i].clear()
+        if spec_token_ids is not None:
+            for i in discard_sampled_tokens_req_indices:
+                spec_token_ids[i].clear()

        # Cache the sampled tokens in the model runner, so that the scheduler
        # doesn't need to send them back.