Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
be9474bd
Unverified
Commit
be9474bd
authored
Apr 04, 2022
by
SaulLu
Committed by
GitHub
Apr 04, 2022
Browse files
add a test checking the format of `convert_tokens_to_string`'s output (#16540)
* add new tests * add comment to overridden tests
parent
24a85cca
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
53 additions
and
0 deletions
+53
-0
tests/byt5/test_tokenization_byt5.py
tests/byt5/test_tokenization_byt5.py
+11
-0
tests/perceiver/test_tokenization_perceiver.py
tests/perceiver/test_tokenization_perceiver.py
+11
-0
tests/test_tokenization_common.py
tests/test_tokenization_common.py
+9
-0
tests/wav2vec2/test_tokenization_wav2vec2.py
tests/wav2vec2/test_tokenization_wav2vec2.py
+11
-0
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
+11
-0
No files found.
tests/byt5/test_tokenization_byt5.py
View file @
be9474bd
...
...
@@ -321,3 +321,14 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def test_conversion_reversible(self):
    """Intentionally empty: there is no vocabulary whose ids could be round-tripped, so nothing is tested."""
def test_convert_tokens_to_string_format(self):
    """Check that `convert_tokens_to_string` returns a `str` for ByT5 tokenizers.

    The default common tokenizer test uses tokens that are invalid for ByT5,
    which only accepts one-character strings and special added tokens, so the
    input is replaced here by single characters plus the "</s>" token.
    """
    candidates = self.get_tokenizers(fast=True, do_lower_case=True)
    for candidate in candidates:
        label = f"{candidate.__class__.__name__}"
        with self.subTest(label):
            # One token per character of "this is a text", then the special token.
            char_tokens = list("this is a text") + ["</s>"]
            decoded = candidate.convert_tokens_to_string(char_tokens)
            self.assertIsInstance(decoded, str)
tests/perceiver/test_tokenization_perceiver.py
View file @
be9474bd
...
...
@@ -286,3 +286,14 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def test_conversion_reversible(self):
    """Intentionally empty: there is no vocabulary whose ids could be round-tripped, so nothing is tested."""
def test_convert_tokens_to_string_format(self):
    """Check that `convert_tokens_to_string` returns a `str` for Perceiver tokenizers.

    The default common tokenizer test uses tokens that are invalid for
    Perceiver, which only accepts one-character strings and special added
    tokens, so the input is replaced here by single characters wrapped in the
    "[CLS]" and "[SEP]" tokens.
    """
    candidates = self.get_tokenizers(fast=True, do_lower_case=True)
    for candidate in candidates:
        label = f"{candidate.__class__.__name__}"
        with self.subTest(label):
            # "[CLS]", one token per character of "this is a test", then "[SEP]".
            char_tokens = ["[CLS]"] + list("this is a test") + ["[SEP]"]
            decoded = candidate.convert_tokens_to_string(char_tokens)
            self.assertIsInstance(decoded, str)
tests/test_tokenization_common.py
View file @
be9474bd
...
...
@@ -3713,6 +3713,15 @@ class TokenizerTesterMixin:
trainer
.
save_model
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
))
self
.
assertIn
(
"tokenizer.json"
,
os
.
listdir
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
)))
def test_convert_tokens_to_string_format(self):
    """Check that `convert_tokens_to_string` returns a `str` for every tokenizer under test.

    Each tokenizer returned by `get_tokenizers` is exercised in its own
    sub-test, labelled with the tokenizer's class name.
    """
    candidates = self.get_tokenizers(fast=True, do_lower_case=True)
    for candidate in candidates:
        label = f"{candidate.__class__.__name__}"
        with self.subTest(label):
            word_tokens = "this is a test".split(" ")
            decoded = candidate.convert_tokens_to_string(word_tokens)
            self.assertIsInstance(decoded, str)
def
test_save_slow_from_fast_and_reload_fast
(
self
):
if
not
self
.
test_slow_tokenizer
or
not
self
.
test_rust_tokenizer
:
# we need both slow and fast versions
...
...
tests/wav2vec2/test_tokenization_wav2vec2.py
View file @
be9474bd
...
...
@@ -753,3 +753,14 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@unittest.skip(
    reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
def test_torch_encode_plus_sent_to_model(self):
    """Skipped unconditionally; the reason is given on the decorator."""
def test_convert_tokens_to_string_format(self):
    """Check the output format of `convert_tokens_to_string` for Wav2Vec2 tokenizers.

    The default common tokenizer test assumes that `convert_tokens_to_string`
    returns a string, which is not the case for Wav2Vec2: the string is read
    from the "text" entry of the output instead.
    """
    candidates = self.get_tokenizers(fast=True, do_lower_case=True)
    for candidate in candidates:
        label = f"{candidate.__class__.__name__}"
        with self.subTest(label):
            # One token per character; "|" appears between the words.
            char_tokens = list("THIS|IS|A|TEXT")
            result = candidate.convert_tokens_to_string(char_tokens)
            self.assertIsInstance(result["text"], str)
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
View file @
be9474bd
...
...
@@ -398,3 +398,14 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@unittest.skip(
    reason="The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
def test_torch_encode_plus_sent_to_model(self):
    """Skipped unconditionally; the reason is given on the decorator."""
def test_convert_tokens_to_string_format(self):
    """Check the output format of `convert_tokens_to_string` for Wav2Vec2PhonemeCTCTokenizer.

    The default common tokenizer test assumes that `convert_tokens_to_string`
    returns a string, which is not the case for Wav2Vec2PhonemeCTCTokenizer:
    the string is read from the "text" entry of the output instead.
    """
    candidates = self.get_tokenizers(fast=True, do_lower_case=True)
    for candidate in candidates:
        label = f"{candidate.__class__.__name__}"
        with self.subTest(label):
            # Phoneme symbols are listed explicitly, one token each.
            phoneme_tokens = [
                "ð",
                "ɪ",
                "s",
                "ɪ",
                "z",
                "ɐ",
                "t",
                "ɛ",
                "k",
                "s",
                "t",
            ]
            result = candidate.convert_tokens_to_string(phoneme_tokens)
            self.assertIsInstance(result["text"], str)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment