Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0535e5fe
Unverified
Commit
0535e5fe
authored
Nov 08, 2024
by
Patrick von Platen
Committed by
GitHub
Nov 08, 2024
Browse files
Fix edge case Mistral tokenizer (#10152)
parent
b489fc3c
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
9 deletions
+9
-9
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+9
-9
No files found.
vllm/transformers_utils/tokenizers/mistral.py
View file @
0535e5fe
...
@@ -72,11 +72,12 @@ class MistralTokenizer:
...
@@ -72,11 +72,12 @@ class MistralTokenizer:
self
.
instruct
=
tokenizer
.
instruct_tokenizer
self
.
instruct
=
tokenizer
.
instruct_tokenizer
tokenizer_
=
tokenizer
.
instruct_tokenizer
.
tokenizer
tokenizer_
=
tokenizer
.
instruct_tokenizer
.
tokenizer
if
isinstance
(
tokenizer_
,
Tekkenizer
):
self
.
is_tekken
=
isinstance
(
tokenizer_
,
Tekkenizer
)
self
.
is_spm
=
isinstance
(
tokenizer_
,
SentencePieceTokenizer
)
if
self
.
is_tekken
:
# Make sure special tokens will not raise
# Make sure special tokens will not raise
tokenizer_
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
tokenizer_
.
special_token_policy
=
SpecialTokenPolicy
.
IGNORE
elif
self
.
is_spm
:
elif
isinstance
(
tokenizer_
,
SentencePieceTokenizer
):
pass
pass
else
:
else
:
raise
TypeError
(
f
"Unsupported tokenizer:
{
type
(
tokenizer_
)
}
"
)
raise
TypeError
(
f
"Unsupported tokenizer:
{
type
(
tokenizer_
)
}
"
)
...
@@ -218,7 +219,7 @@ class MistralTokenizer:
...
@@ -218,7 +219,7 @@ class MistralTokenizer:
return
encoded
.
tokens
return
encoded
.
tokens
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
])
->
str
:
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
)
:
if
self
.
is_tekken
:
tokens
=
[
tokens
=
[
t
for
t
in
tokens
t
for
t
in
tokens
if
t
not
in
self
.
tokenizer
.
_all_special_tokens
if
t
not
in
self
.
tokenizer
.
_all_special_tokens
...
@@ -270,21 +271,20 @@ class MistralTokenizer:
...
@@ -270,21 +271,20 @@ class MistralTokenizer:
skip_special_tokens
skip_special_tokens
),
"skip_special_tokens=False is not supported for Mistral tokenizers."
),
"skip_special_tokens=False is not supported for Mistral tokenizers."
assert
isinstance
(
self
.
tokenizer
,
assert
self
.
is_tekken
or
self
.
is_spm
,
type
(
self
.
tokenizer
)
(
Tekkenizer
,
SentencePieceTokenizer
)),
type
(
self
.
tokenizer
)
if
isinstance
(
self
.
tokenizer
,
Tekkenizer
)
:
if
self
.
is_tekken
:
# skip special tokens
# skip special tokens
ids
=
[
i
for
i
in
ids
if
i
>
self
.
tokenizer
.
num_special_tokens
]
ids
=
[
i
for
i
in
ids
if
i
>
self
.
tokenizer
.
num_special_tokens
]
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
tokens
=
[
self
.
tokenizer
.
id_to_piece
(
id
)
for
id
in
ids
]
if
any
(
"�"
in
t
for
t
in
tokens
):
if
any
(
"�"
in
t
for
t
in
tokens
)
and
self
.
is_tekken
:
# if a decoded token contains the replacement character, then the
# if a decoded token contains the replacement character, then the
# token has an incomplete UTF-8 character so we must use bytes
# token has an incomplete UTF-8 character so we must use bytes
# See: https://github.com/vllm-project/vllm/pull/8640
# See: https://github.com/vllm-project/vllm/pull/8640
# https://github.com/vllm-project/vllm/pull/9625
# https://github.com/vllm-project/vllm/pull/9625
# if the underlying tokenizer is sentencepiece, we just add "�"
tokens
=
[
self
.
tokenizer
.
id_to_byte_piece
(
id
)
for
id
in
ids
]
tokens
=
[
self
.
tokenizer
.
id_to_byte_piece
(
id
)
for
id
in
ids
]
return
tokens
return
tokens
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment