Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
be9474bd
Unverified
Commit
be9474bd
authored
Apr 04, 2022
by
SaulLu
Committed by
GitHub
Apr 04, 2022
Browse files
add a test checking the format of `convert_tokens_to_string`'s output (#16540)
* add new tests * add comment to overridden tests
parent
24a85cca
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
53 additions
and
0 deletions
+53
-0
tests/byt5/test_tokenization_byt5.py
tests/byt5/test_tokenization_byt5.py
+11
-0
tests/perceiver/test_tokenization_perceiver.py
tests/perceiver/test_tokenization_perceiver.py
+11
-0
tests/test_tokenization_common.py
tests/test_tokenization_common.py
+9
-0
tests/wav2vec2/test_tokenization_wav2vec2.py
tests/wav2vec2/test_tokenization_wav2vec2.py
+11
-0
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
+11
-0
No files found.
tests/byt5/test_tokenization_byt5.py
View file @
be9474bd
...
@@ -321,3 +321,14 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -321,3 +321,14 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def
test_conversion_reversible
(
self
):
def
test_conversion_reversible
(
self
):
pass
pass
def test_convert_tokens_to_string_format(self):
    # Overridden from the common tests: ByT5 only accepts one-character
    # strings and special added tokens, so the common test's multi-char
    # tokens would be invalid here.
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sample_tokens = list("this is a text") + ["</s>"]
            result = tokenizer.convert_tokens_to_string(sample_tokens)
            self.assertIsInstance(result, str)
tests/perceiver/test_tokenization_perceiver.py
View file @
be9474bd
...
@@ -286,3 +286,14 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -286,3 +286,14 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
# tests all ids in vocab => vocab doesn't exist so unnecessary to test
def
test_conversion_reversible
(
self
):
def
test_conversion_reversible
(
self
):
pass
pass
def test_convert_tokens_to_string_format(self):
    # Overridden from the common tests: Perceiver only accepts one-character
    # strings and special added tokens, so the common test's multi-char
    # tokens would be invalid here.
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            sample_tokens = ["[CLS]"] + list("this is a test") + ["[SEP]"]
            result = tokenizer.convert_tokens_to_string(sample_tokens)
            self.assertIsInstance(result, str)
tests/test_tokenization_common.py
View file @
be9474bd
...
@@ -3713,6 +3713,15 @@ class TokenizerTesterMixin:
...
@@ -3713,6 +3713,15 @@ class TokenizerTesterMixin:
trainer
.
save_model
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
))
trainer
.
save_model
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
))
self
.
assertIn
(
"tokenizer.json"
,
os
.
listdir
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
)))
self
.
assertIn
(
"tokenizer.json"
,
os
.
listdir
(
os
.
path
.
join
(
tmp_dir
,
"checkpoint"
)))
def test_convert_tokens_to_string_format(self):
    # Common contract check: `convert_tokens_to_string` is expected to
    # return a plain `str` for every tokenizer following the common API.
    # Tokenizers with a different return type override this test.
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            result = tokenizer.convert_tokens_to_string("this is a test".split())
            self.assertIsInstance(result, str)
def
test_save_slow_from_fast_and_reload_fast
(
self
):
def
test_save_slow_from_fast_and_reload_fast
(
self
):
if
not
self
.
test_slow_tokenizer
or
not
self
.
test_rust_tokenizer
:
if
not
self
.
test_slow_tokenizer
or
not
self
.
test_rust_tokenizer
:
# we need both slow and fast versions
# we need both slow and fast versions
...
...
tests/wav2vec2/test_tokenization_wav2vec2.py
View file @
be9474bd
...
@@ -753,3 +753,14 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -753,3 +753,14 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@
unittest
.
skip
(
"The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
@
unittest
.
skip
(
"The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
def
test_torch_encode_plus_sent_to_model
(
self
):
def
test_torch_encode_plus_sent_to_model
(
self
):
pass
pass
def test_convert_tokens_to_string_format(self):
    # Overridden from the common tests: Wav2Vec2's `convert_tokens_to_string`
    # returns a dict with a "text" entry rather than a bare string, so only
    # that entry is checked for being a `str`.
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            ctc_tokens = list("THIS|IS|A|TEXT")
            decoded = tokenizer.convert_tokens_to_string(ctc_tokens)
            self.assertIsInstance(decoded["text"], str)
tests/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py
View file @
be9474bd
...
@@ -398,3 +398,14 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -398,3 +398,14 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@
unittest
.
skip
(
"The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
@
unittest
.
skip
(
"The tokenizer shouldn't be used to encode input IDs (except for labels), only to decode."
)
def
test_torch_encode_plus_sent_to_model
(
self
):
def
test_torch_encode_plus_sent_to_model
(
self
):
pass
pass
def test_convert_tokens_to_string_format(self):
    # Overridden from the common tests: Wav2Vec2PhonemeCTCTokenizer's
    # `convert_tokens_to_string` returns a dict with a "text" entry rather
    # than a bare string, so only that entry is checked for being a `str`.
    for tokenizer in self.get_tokenizers(fast=True, do_lower_case=True):
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            phoneme_tokens = list("ðɪsɪzɐtɛkst")
            decoded = tokenizer.convert_tokens_to_string(phoneme_tokens)
            self.assertIsInstance(decoded["text"], str)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment