[Perf] Avoid pageable HtoD transfer in MinTokensLogitsProcessor (#29826)

Signed-off-by: jthomson04 <jwillthomson19@gmail.com> (cherry picked from commit 1528e079)

[Perf] Avoid pageable HtoD transfer in MinTokensLogitsProcessor (#29826)
Signed-off-by: jthomson04 <jwillthomson19@gmail.com> (cherry picked from commit 1528e079)
5c7c09af · jthomson04 · Kevin H. Luu · 7f718169 · 5c7c09af
Commit 5c7c09af, authored Dec 02, 2025 by jthomson04; committed by Kevin H. Luu on Dec 02, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 6 additions and 2 deletions

vllm/v1/sample/logits_processor/builtin.py vllm/v1/sample/logits_processor/builtin.py +6 -2

No files found.
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -110,7 +110,7 @@ class MinPLogitsProcessor(LogitsProcessor):
        # Identify valid tokens using threshold comparison
        invalid_token_mask = probability_values < adjusted_min_p
        # Apply mask using boolean indexing
-        logits[invalid_token_mask] = -float("inf")
+        logits.masked_fill_(invalid_token_mask, -float("inf"))
        return logits


@@ -178,6 +178,10 @@ class MinTokensLogitsProcessor(LogitsProcessor):
            self._device_tensor([], torch.int32),
        )

+        self.neg_inf_tensor = torch.tensor(
+            -float("inf"), dtype=torch.float32, device=self.device
+        )
+
    def is_argmax_invariant(self) -> bool:
        """By censoring stop tokens, min-tokens can change the outcome
        of the argmax operation in greedy sampling."""
@@ -229,7 +233,7 @@ class MinTokensLogitsProcessor(LogitsProcessor):
    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        if self.min_toks:
            # Inhibit EOS token for requests which have not reached min length
-            logits[self.logits_slice] = -float("inf")
+            logits.index_put_(self.logits_slice, self.neg_inf_tensor)
        return logits