Unverified commit 5340a30d, authored by Alex Brooks, committed by GitHub
Browse files

Fix Max Token ID for Qwen-VL-Chat (#11980)


Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
parent 89ce62a3
import contextlib
import os import os
import warnings import warnings
from pathlib import Path from pathlib import Path
...@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: ...@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
tokenizer.all_special_tokens_extended) tokenizer.all_special_tokens_extended)
tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
tokenizer_len = len(tokenizer) tokenizer_len = len(tokenizer)
max_token_id = max(tokenizer.get_vocab().values()) max_token_id = max(tokenizer.get_vocab().values())
# Some tokenizers (e.g., QwenTokenizer) have special tokens that
# are added and included in the implementation of the vocab_size
# property, but not in get_vocab(); if there is an implementation
# of vocab size, we should take the greater value.
if hasattr(tokenizer, "vocab_size"):
with contextlib.suppress(NotImplementedError):
max_token_id = max(max_token_id, tokenizer.vocab_size)
class CachedTokenizer(tokenizer.__class__): # type: ignore class CachedTokenizer(tokenizer.__class__): # type: ignore
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment