Merge branch 'v0.9.2-dev-fix-pp+cp' into 'v0.9.2-dev'

fix: 修复deepseek在PP>1开chunkprefill精度问题

See merge request dcutoolkit/deeplearing/vllm!342

Merge branch 'v0.9.2-dev-fix-pp+cp' into 'v0.9.2-dev'
fix: 修复deepseek在PP>1开chunkprefill精度问题

See merge request dcutoolkit/deeplearing/vllm!342
155c8a13 · zhuwenwen · 03ca39bd · 4b752578 · 155c8a13
Commit 155c8a13 authored Jan 06, 2026 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +6 -0

No files found.
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1638,6 +1638,9 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
        # Mask out the sampled tokens that should not be sampled.
        for i in discard_sampled_tokens_req_indices:
            valid_sampled_token_ids[i].clear()
+        if spec_token_ids is not None:
+            for i in discard_sampled_tokens_req_indices:
+                spec_token_ids[i].clear()
        # Cache the sampled tokens in the model runner, so that the scheduler
        # doesn't need to send them back.
@@ -3519,6 +3522,9 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
        # Mask out the sampled tokens that should not be sampled.
        for i in discard_sampled_tokens_req_indices:
            valid_sampled_token_ids[i].clear()
+        if spec_token_ids is not None:
+            for i in discard_sampled_tokens_req_indices:
+                spec_token_ids[i].clear()
        # Cache the sampled tokens in the model runner, so that the scheduler
        # doesn't need to send them back.