去掉宽松mtp中的隐式同步

03e822d1 · 王敏 · deae0a22 · 03e822d1 · 03e822d1
Commit 03e822d1 authored Nov 27, 2025 by 王敏
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 6 deletions

vllm/v1/sample/rejection_sampler_opt.py +4 -5

vllm/v1/spec_decode/utils.py +7 -1

No files found.
--- a/vllm/v1/sample/rejection_sampler_opt.py
+++ b/vllm/v1/sample/rejection_sampler_opt.py
@@ -87,12 +87,8 @@ class OptRejectionSampler(nn.Module):
        assert metadata.max_spec_len <= MAX_SPEC_LEN
        target_probs = target_logits.softmax(dim=-1, dtype=torch.float32)
-        draft_token_ids = metadata.draft_token_ids
-        mask = draft_token_ids.eq(-1).to(torch.bool)
-        draft_token_ids = torch.where(mask, 0, draft_token_ids).to(torch.long)  # 兼容第一次decode
        output_token_ids = rejection_sample(
-            draft_token_ids,
+            metadata.draft_token_ids,
            metadata.num_draft_tokens,
            metadata.max_spec_len,
            metadata.cu_num_draft_tokens,
@@ -225,6 +221,8 @@ def rejection_random_sample_kernel(
    for pos in range(num_draft_tokens):
        if not rejected:
            draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos)
+            if draft_token_id < 0:
+                draft_token_id = 0
            if NO_DRAFT_PROBS:
                draft_prob = 1
            else:
@@ -235,6 +233,7 @@ def rejection_random_sample_kernel(
                                  (start_idx + pos) * vocab_size +
                                  draft_token_id)
+            draft_token_id = draft_token_id.to(tl.int64)
            target_token_id = tl.load(target_token_ids_ptr + (start_idx + pos))
            target_token_id = target_token_id.to(tl.int64)
            uniform_prob = tl.load(uniform_probs_ptr + start_idx + pos)

--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -6,6 +6,7 @@ import torch
 from vllm.sampling_params import SamplingParams
 from vllm.triton_utils import tl, triton
+from vllm.utils import async_tensor_h2d
 _SAMPLING_EPS = 1e-5
@@ -80,5 +81,10 @@ class DraftProbs(ABC):  # type: ignore[call-arg]
    def get_probs(self, req_ids: list[str]):
        index = [self._req_ids.index(req_id) for req_id in req_ids]
-        return self.draft_probs[index]
+        index_tensor = async_tensor_h2d(
+                    index,
+                    dtype=torch.int32,
+                    target_device=self.draft_probs.device,
+                    pin_memory=True)
+        return self.draft_probs[index_tensor]