Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
54cacf00
Unverified
Commit
54cacf00
authored
Jan 18, 2025
by
Kunshang Ji
Committed by
GitHub
Jan 17, 2025
Browse files
[Bugfix] Mistral tokenizer encode accept list of str (#12149)
Signed-off-by:
Kunshang Ji
<
kunshang.ji@intel.com
>
parent
58fd57ff
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
8 deletions
+30
-8
vllm/transformers_utils/tokenizers/mistral.py
vllm/transformers_utils/tokenizers/mistral.py
+30
-8
No files found.
vllm/transformers_utils/tokenizers/mistral.py
View file @
54cacf00
...
...
@@ -18,6 +18,7 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy,
Tekkenizer
)
from
vllm.logger
import
init_logger
from
vllm.utils
import
is_list_of
if
TYPE_CHECKING
:
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
...
...
@@ -27,7 +28,7 @@ logger = init_logger(__name__)
@
dataclass
class
Encoding
:
input_ids
:
List
[
int
]
input_ids
:
Union
[
List
[
int
]
,
List
[
List
[
int
]]]
def
maybe_serialize_tool_calls
(
request
:
ChatCompletionRequest
):
...
...
@@ -223,17 +224,25 @@ class MistralTokenizer:
def
__call__
(
self
,
prompt
:
str
,
prompt
:
Union
[
str
,
List
[
str
],
List
[
int
]]
,
add_special_tokens
:
bool
=
False
,
truncation
:
bool
=
False
,
max_length
:
Optional
[
int
]
=
None
,
):
# Mistral Tokenizers should not add special tokens
input_ids
=
self
.
encode
(
prompt
)
if
truncation
:
input_ids
=
input_ids
[:
max_length
]
input_ids
:
Union
[
List
[
int
],
List
[
List
[
int
]]]
# For List[str], original prompt text
if
is_list_of
(
prompt
,
str
):
input_ids_
:
List
[
List
[
int
]]
=
[]
for
p
in
prompt
:
each_input_ids
=
self
.
encode_one
(
p
,
truncation
,
max_length
)
input_ids_
.
append
(
each_input_ids
)
input_ids
=
input_ids_
# For List[int], apply chat template output, already tokens.
elif
is_list_of
(
prompt
,
int
):
input_ids
=
prompt
# For str, single prompt text
else
:
input_ids
=
self
.
encode_one
(
prompt
,
truncation
,
max_length
)
return
Encoding
(
input_ids
=
input_ids
)
def
get_vocab
(
self
)
->
Dict
[
str
,
int
]:
...
...
@@ -245,6 +254,19 @@ class MistralTokenizer:
# Mistral tokenizers have no added vocabulary
return
{}
def
encode_one
(
self
,
prompt
:
str
,
truncation
:
bool
=
False
,
max_length
:
Optional
[
int
]
=
None
,
)
->
List
[
int
]:
# Mistral Tokenizers should not add special tokens
input_ids
=
self
.
encode
(
prompt
)
if
truncation
:
input_ids
=
input_ids
[:
max_length
]
return
input_ids
def
encode
(
self
,
prompt
:
str
)
->
List
[
int
]:
# `encode` should only be used for prompt completion
# it should never be used for chat_completion.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment