vllm · Commit dd54a4b0 (unverified)
Authored Sep 14, 2023 by Antoni Baum; committed Sep 14, 2023 by GitHub
Parent: eda1a7ca

Fix detokenization leaving special tokens (#1044)

Signed-off-by: Antoni Baum <antoni.baum@protonmail.com>
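For context on the bug: Hugging Face's `convert_ids_to_tokens` honors `skip_special_tokens` only when it is given a list of ids; when handed a single int id the flag is silently ignored, so during incremental detokenization special tokens such as EOS leaked into the decoded text. A minimal sketch of the problematic behavior, using an illustrative GPT-2 tokenizer (not named in the commit):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative choice
eos = tokenizer.eos_token_id

# With a bare int id, skip_special_tokens is silently ignored by
# Hugging Face, so the special-token string still comes back and
# ends up appended to the streamed output.
print(tokenizer.convert_ids_to_tokens(eos, skip_special_tokens=True))
# '<|endoftext|>'
```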
Showing 2 changed files with 14 additions and 7 deletions (+14, -7):

- tests/engine/test_detokenize.py (+11, -4)
- vllm/transformers_utils/tokenizer.py (+3, -3)
tests/engine/test_detokenize.py

```diff
@@ -23,7 +23,8 @@ TOKENIZERS = [
 ]
 
 
-def _run_incremental_decode(tokenizer, all_input_ids):
+def _run_incremental_decode(tokenizer, all_input_ids,
+                            skip_special_tokens: bool):
     decoded_text = ""
     offset = 0
     token_offset = 0
@@ -35,7 +36,7 @@ def _run_incremental_decode(tokenizer, all_input_ids):
             prev_tokens,
             offset,
             token_offset,
-            skip_special_tokens=False)
+            skip_special_tokens=skip_special_tokens)
         decoded_text += text
         if prev_tokens is None:
             prev_tokens = new_tokens
@@ -46,10 +47,16 @@ def _run_incremental_decode(tokenizer, all_input_ids):
 
 @pytest.mark.parametrize("truth", TRUTH)
 @pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
-def test_decode_streaming(tokenizer_id, truth):
+@pytest.mark.parametrize("skip_special_tokens", (True, False))
+def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
     all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
+    if skip_special_tokens:
+        all_input_ids = ([tokenizer.bos_token_id]
+                         if tokenizer.bos_token_id is not None else
+                         []) + all_input_ids + [tokenizer.eos_token_id]
 
-    decoded_text = _run_incremental_decode(tokenizer, all_input_ids)
+    decoded_text = _run_incremental_decode(
+        tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)
 
     assert decoded_text == truth
```
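As a usage note, here is a minimal sketch of the streaming pattern this test exercises, assuming `detokenize_incrementally` is called the way the `_run_incremental_decode` helper above calls it (the GPT-2 tokenizer, the prompt, and the per-step `[:i + 1]` slicing are illustrative, not taken verbatim from the commit):

```python
from transformers import AutoTokenizer

from vllm.transformers_utils.tokenizer import detokenize_incrementally

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative choice
all_input_ids = tokenizer("Hello, world!")["input_ids"]
all_input_ids += [tokenizer.eos_token_id]  # simulate generation ending in EOS

decoded_text = ""
prev_tokens, offset, token_offset = None, 0, 0
for i in range(len(all_input_ids)):
    # Feed one more token per iteration, as a streaming server would.
    new_tokens, text, offset, token_offset = detokenize_incrementally(
        tokenizer,
        all_input_ids[:i + 1],
        prev_tokens,
        offset,
        token_offset,
        skip_special_tokens=True)
    decoded_text += text
    if prev_tokens is None:
        prev_tokens = new_tokens
    else:
        prev_tokens += new_tokens

# Post-fix, the trailing EOS no longer leaks into the streamed text.
print(decoded_text)  # "Hello, world!"
```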
vllm/transformers_utils/tokenizer.py

```diff
@@ -119,9 +119,9 @@ def detokenize_incrementally(
         prefix_offset = max(len(output_tokens) - 6, 0)
         read_offset = max(len(output_tokens) - 1, 0)
     else:
-        new_token = tokenizer.convert_ids_to_tokens(
-            new_token_id, skip_special_tokens=skip_special_tokens)
-        new_tokens = [new_token]
+        # Put new_token_id in a list so skip_special_tokens is respected
+        new_tokens = tokenizer.convert_ids_to_tokens(
+            [new_token_id], skip_special_tokens=skip_special_tokens)
         output_tokens = prev_tokens + new_tokens
 
     # The prefix text is necessary only to defeat cleanup algorithms in
```
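The comment in the hunk is the crux of the fix: Hugging Face filters special ids only on the list code path of `convert_ids_to_tokens`, where ids found in `tokenizer.all_special_ids` are dropped. A self-contained sketch of the now-fixed call shape (GPT-2 again as an illustrative tokenizer):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative choice
eos = tokenizer.eos_token_id

# With a one-element list, ids in tokenizer.all_special_ids are filtered.
print(tokenizer.convert_ids_to_tokens([eos], skip_special_tokens=True))
# []  -> the EOS token is dropped from the incremental output
print(tokenizer.convert_ids_to_tokens([eos], skip_special_tokens=False))
# ['<|endoftext|>']  -> kept when the caller wants special tokens
```

Returning the (possibly empty) list directly also lets `output_tokens = prev_tokens + new_tokens` work unchanged whether or not the new token was filtered out.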