Unverified Commit 691a3ec0 authored by Guillaume Calmettes's avatar Guillaume Calmettes Committed by GitHub
Browse files

[Bugfix] Ensure special tokens are properly filtered out for guided structured...


[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363)
Signed-off-by: default avatarGuillaume Calmettes <gcalmettes@scaleway.com>
parent 3a763ba0
......@@ -17,7 +17,7 @@ pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.6
lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1
typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
......
......@@ -174,18 +174,29 @@ class MistralTokenizer:
revision=revision)
return tokenizer_file
# the following attributes are set to fit VLLM's design
# the following attributes are set to fit VLLM's design and are used
# by the guided structured output backends.
@property
def all_special_tokens_extended(self) -> List[str]:
return []
# tekken defines its own extended special tokens list
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
special_tokens = self.tokenizer.SPECIAL_TOKENS
else:
special_tokens = list(SpecialTokens)
return [
s.value if isinstance(s, SpecialTokens) else s
for s in special_tokens
]
@property
def all_special_tokens(self) -> List[str]:
return []
return self.all_special_tokens_extended
@property
def all_special_ids(self) -> List[int]:
return []
return [
self.all_special_tokens.index(t) for t in self.all_special_tokens
]
@property
def bos_token_id(self) -> int:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment