[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)

Signed-off-by: Lu Fang <lufang@fb.com>

[Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)
Signed-off-by: Lu Fang <lufang@fb.com>
8d6cd32b · Lu Fang · GitHub · ec79b67c · 8d6cd32b · 8d6cd32b
Unverified Commit 8d6cd32b authored Mar 05, 2025 by Lu Fang Committed by GitHub Mar 05, 2025
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 4 deletions

vllm/v1/engine/processor.py vllm/v1/engine/processor.py +5 -3

vllm/v1/worker/gpu_input_batch.py vllm/v1/worker/gpu_input_batch.py +7 -1

No files found.
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -92,10 +92,12 @@ class Processor:
            return
        if params.allowed_token_ids is None:
            return
-        if not all(0 <= tid < self.model_config.vocab_size
+        if not params.allowed_token_ids:
-                   for tid in params.allowed_token_ids):
+            raise ValueError("allowed_token_ids is not None and empty!")
+        vocab_size = self.model_config.get_vocab_size()
+        if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids):
            raise ValueError(
-                "allowed_token_ids contains out-of-vocab token id")
+                "allowed_token_ids contains out-of-vocab token id!")
    def process_inputs(
        self,

--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -199,6 +199,8 @@ class InputBatch:
        self.logit_bias: list[Optional[dict[int,
                                            float]]] = [None] * max_num_reqs
        self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, an allowed token's entry is False,
+        # because masked_fill_ writes -inf wherever the mask is True.
        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
@@ -300,6 +302,7 @@ class InputBatch:
            self.has_allowed_token_ids.add(req_id)
            if self.allowed_token_ids_mask_cpu_tensor is None:
                # Lazy allocation for this tensor, which can be large.
+                # False means we don't fill with -inf.
                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                          self.vocab_size,
                                                          dtype=torch.bool,
@@ -309,8 +312,10 @@ class InputBatch:
                    self.vocab_size,
                    dtype=torch.bool,
                    device="cpu")
+            self.allowed_token_ids_mask_cpu_tensor[req_index] = True
+            # False means we don't fill with -inf.
            self.allowed_token_ids_mask_cpu_tensor[req_index][
-                sampling_params.allowed_token_ids] = True
+                sampling_params.allowed_token_ids] = False
        # Add request lora ID
        if request.lora_request:
@@ -359,6 +364,7 @@ class InputBatch:
        self.logit_bias[req_index] = None
        self.has_allowed_token_ids.discard(req_id)
        if self.allowed_token_ids_mask_cpu_tensor is not None:
+            # False means we don't fill with -inf.
            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
        return req_index