xdb4_94051/vllm, commit dd54a4b0 (unverified)
Authored Sep 14, 2023 by Antoni Baum, committed by GitHub on Sep 14, 2023

Fix detokenization leaving special tokens (#1044)

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>

Parent: eda1a7ca
Showing 2 changed files with 14 additions and 7 deletions:

    tests/engine/test_detokenize.py        +11  -4
    vllm/transformers_utils/tokenizer.py    +3  -3
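For context, the root cause sits in how Hugging Face tokenizers handle convert_ids_to_tokens: the list form honors skip_special_tokens, while the single-int form silently ignores the flag (this is what the new in-code comment in the second diff alludes to). A minimal sketch of the difference; the tokenizer name is an arbitrary illustrative choice, not taken from this commit:

    from transformers import AutoTokenizer

    # Any tokenizer with an EOS token illustrates the point;
    # facebook/opt-125m is just an example.
    tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
    eos = tok.eos_token_id

    # Single-id form: skip_special_tokens is silently ignored.
    print(tok.convert_ids_to_tokens(eos, skip_special_tokens=True))    # '</s>'
    # List form: the flag is respected and the special token is dropped.
    print(tok.convert_ids_to_tokens([eos], skip_special_tokens=True))  # []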
tests/engine/test_detokenize.py (view file @ dd54a4b0)

@@ -23,7 +23,8 @@ TOKENIZERS = [
 ]
 
 
-def _run_incremental_decode(tokenizer, all_input_ids):
+def _run_incremental_decode(tokenizer, all_input_ids,
+                            skip_special_tokens: bool):
     decoded_text = ""
     offset = 0
     token_offset = 0
@@ -35,7 +36,7 @@ def _run_incremental_decode(tokenizer, all_input_ids):
             prev_tokens,
             offset,
             token_offset,
-            skip_special_tokens=False)
+            skip_special_tokens=skip_special_tokens)
         decoded_text += text
         if prev_tokens is None:
             prev_tokens = new_tokens
@@ -46,10 +47,16 @@ def _run_incremental_decode(tokenizer, all_input_ids):
 
 @pytest.mark.parametrize("truth", TRUTH)
 @pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-def test_decode_streaming(tokenizer_id, truth):
+@pytest.mark.parametrize("skip_special_tokens", (True, False))
+def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+    if skip_special_tokens:
+        all_input_ids = ([tokenizer.bos_token_id]
+                         if tokenizer.bos_token_id is not None else
+                         []) + all_input_ids + [tokenizer.eos_token_id]
 
-    decoded_text = _run_incremental_decode(tokenizer, all_input_ids)
+    decoded_text = _run_incremental_decode(
+        tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
 
     assert decoded_text == truth
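The test design is worth a note: truth itself contains no special-token text, so the BOS/EOS wrap in the skip_special_tokens=True case makes the assert fail unless incremental decoding actually drops the specials. A hedged standalone illustration of that round trip, again with an arbitrary tokenizer rather than one from the commit's TOKENIZERS list:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
    truth = "Hello, world!"
    ids = tok(truth, add_special_tokens=False)["input_ids"]

    # Wrap with BOS/EOS the way the updated test does.
    ids = ([tok.bos_token_id] if tok.bos_token_id is not None else []) \
        + ids + [tok.eos_token_id]

    print(tok.decode(ids, skip_special_tokens=False))  # '</s>Hello, world!</s>'
    print(tok.decode(ids, skip_special_tokens=True))   # 'Hello, world!'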
vllm/transformers_utils/tokenizer.py (view file @ dd54a4b0)

@@ -119,9 +119,9 @@ def detokenize_incrementally(
         prefix_offset = max(len(output_tokens) - 6, 0)
         read_offset = max(len(output_tokens) - 1, 0)
     else:
-        new_token = tokenizer.convert_ids_to_tokens(
-            new_token_id, skip_special_tokens=skip_special_tokens)
-        new_tokens = [new_token]
+        # Put new_token_id in a list so skip_special_tokens is respected
+        new_tokens = tokenizer.convert_ids_to_tokens(
+            [new_token_id], skip_special_tokens=skip_special_tokens)
         output_tokens = prev_tokens + new_tokens
 
     # The prefix text is necessary only to defeat cleanup algorithms in
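With the list form, a skipped special token contributes an empty list, so output_tokens is extended by nothing and no stray text reaches the stream. A minimal sketch of that incremental path using only the Hugging Face API (not vLLM's actual detokenize_incrementally, whose prefix/read offset bookkeeping is elided here):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("facebook/opt-125m")  # arbitrary example
    ids = tok("Hello, world!", add_special_tokens=False)["input_ids"]
    ids = [tok.bos_token_id] + ids + [tok.eos_token_id]

    output_tokens = []
    for new_token_id in ids:
        # The fixed pattern: list in, list out, so skip_special_tokens applies.
        new_tokens = tok.convert_ids_to_tokens([new_token_id],
                                               skip_special_tokens=True)
        output_tokens += new_tokens  # empty for specials: nothing leaks

    print(tok.convert_tokens_to_string(output_tokens))  # 'Hello, world!'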