Commit 2229ebe7 (unverified)
Authored Aug 01, 2024 by Ita Zaporozhets; committed by GitHub on Aug 01, 2024

update clean_up_tokenization_spaces warning (#32371)

Parent: 05c1f9af
Showing 2 changed files with 8 additions and 46 deletions:

    src/transformers/tokenization_utils_base.py    +8   -0
    tests/test_tokenization_common.py              +0   -46
src/transformers/tokenization_utils_base.py

...
@@ -1593,6 +1593,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
+        if "clean_up_tokenization_spaces" not in kwargs:
+            warnings.warn(
+                "`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This "
+                "behavior will be depracted in transformers v4.45, and will be then set to `False` by default. "
+                "For more details check this issue: https://github.com/huggingface/transformers/issues/31884",
+                FutureWarning,
+            )
+
         # By default, cleaning tokenization spaces for both fast and slow tokenizers
         self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
...
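For downstream callers, the new warning only fires when the flag is left unset, so passing it explicitly both silences the FutureWarning and pins the decoding behavior across the v4.45 default flip. A minimal sketch (the checkpoint name is only an example, not part of this commit):

    from transformers import AutoTokenizer

    # Setting clean_up_tokenization_spaces explicitly means the key is present in
    # the init kwargs, so the warning added above is skipped, and the behavior
    # stays the same regardless of the library's default.
    tokenizer = AutoTokenizer.from_pretrained(
        "google-bert/bert-base-uncased",      # example checkpoint
        clean_up_tokenization_spaces=False,   # or True, to keep the pre-v4.45 default
    )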
tests/test_tokenization_common.py

...
@@ -4247,52 +4247,6 @@ class TokenizerTesterMixin:
             # Should not raise an error
             self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
 
-    # TODO This is ran for all models but only tests bert...
-    def test_clean_up_tokenization_spaces(self):
-        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
-        assert tokenizer.clean_up_tokenization_spaces is True
-
-        tokens = tokenizer.encode("This shouldn't be! He'll go.")
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-        assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
-
-        # Fast from slow
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer.save_pretrained(tmp_dir_2)
-            tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
-            del tokenizer
-
-        assert tokenizer_fast.clean_up_tokenization_spaces is False
-        decoded = tokenizer_fast.decode(tokens)
-        # fast and slow don't have the same output when we don't cleanup
-        # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
-        assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
-
-        tokenizer_fast.clean_up_tokenization_spaces = True
-        assert tokenizer_fast.clean_up_tokenization_spaces is True
-        decoded = tokenizer_fast.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
-        # Slow from fast
-        with tempfile.TemporaryDirectory() as tmp_dir_2:
-            tokenizer_fast.clean_up_tokenization_spaces = False
-            tokenizer_fast.save_pretrained(tmp_dir_2)
-            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
-
-        assert tokenizer.clean_up_tokenization_spaces is False
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
-
-        tokenizer.clean_up_tokenization_spaces = True
-        decoded = tokenizer.decode(tokens)
-        assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
-
     def test_split_special_tokens(self):
         if not self.test_slow_tokenizer:
             self.skipTest(reason="test_slow_tokenizer is set to False")
...
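The deleted test was the only place the decode-time difference was spelled out, so for reference, a minimal standalone sketch of the behavior it asserted (same checkpoint and input string as the removed test; the expected outputs are taken directly from its assertions):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    tokens = tokenizer.encode("This shouldn't be! He'll go.")

    # With cleanup (the current default), punctuation and contractions are re-attached.
    print(tokenizer.decode(tokens, clean_up_tokenization_spaces=True))
    # -> [CLS] this shouldn't be! he'll go. [SEP]

    # Without cleanup, the spaces introduced by joining word pieces are kept.
    print(tokenizer.decode(tokens, clean_up_tokenization_spaces=False))
    # -> [CLS] this shouldn ' t be ! he ' ll go . [SEP]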