[Bugfix] Fix crash with llama 3.2 vision models and guided decoding (#9631)

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: pavlo-ruban <pavlo.ruban@servicenow.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>

[Bugfix] Fix crash with llama 3.2 vision models and guided decoding (#9631)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: pavlo-ruban <pavlo.ruban@servicenow.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
6567e137 · Travis Johnson · GitHub · 228cfbd0 · 6567e137
Unverified Commit 6567e137 authored Oct 25, 2024 by Travis Johnson Committed by GitHub Oct 25, 2024
Show whitespace changes
Inline Side-by-side

Showing with 11 additions and 3 deletions

vllm/model_executor/guided_decoding/outlines_logits_processors.py +11 -3

No files found.
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -15,11 +15,11 @@
 # limitations under the License.
 import copy
 import json
-import math
 from collections import defaultdict
 from functools import lru_cache
 from typing import Callable, DefaultDict, Dict, List, Union

+import numpy as np
 import torch
 from lark import Lark
 from outlines import grammars
@@ -77,9 +77,17 @@ class BaseLogitsProcessor:
                f"Unsupported instruction type {type(instruction)}")

        mask = torch.full((scores.shape[-1], ),
-                          -math.inf,
+                          -torch.inf,
                          device=scores.device)
-        mask[allowed_tokens] = 0
+        # The tokenizer may support more token ids than the model can generate,
+        # e.g. Llama 3.2 Vision models have an `<|image|>` token with id 128256
+        # but scores.shape == torch.Size([128256])
+        # Using NumPy is faster for filtering token ids
+        allowed_tokens = np.array(allowed_tokens, dtype=np.int64)
+        allowed_tokens = torch.tensor(allowed_tokens, device=scores.device)
+        allowed_tokens = allowed_tokens.masked_select(
+            allowed_tokens < scores.shape[-1])
+        mask.index_fill_(0, allowed_tokens, 0)
        scores.add_(mask)
        return scores