Commit 691a3ec0 (unverified), authored by Guillaume Calmettes, committed by GitHub
Browse files

[Bugfix] Ensure special tokens are properly filtered out for guided structured...


[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
parent 3a763ba0
...@@ -17,7 +17,7 @@ pillow # Required for image processing ...@@ -17,7 +17,7 @@ pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.6 lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1 outlines >= 0.0.43, < 0.1
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
...@@ -31,4 +31,4 @@ pyyaml ...@@ -31,4 +31,4 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.8.0 # required for compressed-tensors compressed-tensors == 0.8.0 # required for compressed-tensors
\ No newline at end of file
...@@ -174,18 +174,29 @@ class MistralTokenizer: ...@@ -174,18 +174,29 @@ class MistralTokenizer:
revision=revision) revision=revision)
return tokenizer_file return tokenizer_file
# the following attributes are set to fit VLLM's design # the following attributes are set to fit VLLM's design and are used
# by the guided structured output backends.
@property @property
def all_special_tokens_extended(self) -> List[str]: def all_special_tokens_extended(self) -> List[str]:
return [] # tekken defines its own extended special tokens list
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
special_tokens = self.tokenizer.SPECIAL_TOKENS
else:
special_tokens = list(SpecialTokens)
return [
s.value if isinstance(s, SpecialTokens) else s
for s in special_tokens
]
@property @property
def all_special_tokens(self) -> List[str]: def all_special_tokens(self) -> List[str]:
return [] return self.all_special_tokens_extended
@property @property
def all_special_ids(self) -> List[int]: def all_special_ids(self) -> List[int]:
return [] return [
self.all_special_tokens.index(t) for t in self.all_special_tokens
]
@property @property
def bos_token_id(self) -> int: def bos_token_id(self) -> int:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment