Unverified commit 2bcbae70, authored by Travis Johnson and committed by GitHub
Browse files

[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken Tokenizer (#10051)


Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
parent ffc0f2b4
...@@ -10,19 +10,22 @@ from ...utils import check_logprobs_close ...@@ -10,19 +10,22 @@ from ...utils import check_logprobs_close
MODELS = [ MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
] ]
MISTRAL_FORMAT_MODELS = [ MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.3",
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410",
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
] ]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [ SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese "勇敢な船乗りについての詩を書く", # japanese
"寫一首關於勇敢的水手的詩", # chinese "寫一首關於勇敢的水手的詩", # chinese
"ပုံပြင်လေးပြောပြပါ်:\n", # burmese
"Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625
] ]
# for function calling # for function calling
......
...@@ -254,7 +254,7 @@ class MistralTokenizer: ...@@ -254,7 +254,7 @@ class MistralTokenizer:
skip_special_tokens: bool = True) -> str: skip_special_tokens: bool = True) -> str:
assert ( assert (
skip_special_tokens skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers." ), "skip_special_tokens=False is not supported for Mistral tokenizers."
if isinstance(ids, int): if isinstance(ids, int):
ids = [ids] ids = [ids]
...@@ -268,12 +268,16 @@ class MistralTokenizer: ...@@ -268,12 +268,16 @@ class MistralTokenizer:
# TODO(Patrick) - potentially allow special tokens to not be skipped # TODO(Patrick) - potentially allow special tokens to not be skipped
assert ( assert (
skip_special_tokens skip_special_tokens
), "Skipping special tokens is not supported for Mistral tokenizers." ), "skip_special_tokens=False is not supported for Mistral tokenizers."
assert isinstance(self.tokenizer, assert isinstance(self.tokenizer,
(Tekkenizer, SentencePieceTokenizer)), type( (Tekkenizer, SentencePieceTokenizer)), type(
self.tokenizer) self.tokenizer)
if isinstance(self.tokenizer, Tekkenizer):
# skip special tokens
ids = [i for i in ids if i > self.tokenizer.num_special_tokens]
tokens = [self.tokenizer.id_to_piece(id) for id in ids] tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any("�" in t for t in tokens): if any("�" in t for t in tokens):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment