Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
c3248cf1
"vscode:/vscode.git/clone" did not exist on "72087f8a178eff6b1890616705f6021cabd8f072"
Commit
c3248cf1
authored
Dec 11, 2019
by
LysandreJik
Committed by
Lysandre Debut
Dec 13, 2019
Browse files
Tests for all tokenizers
parent
f2ac50cb
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
9 additions
and
28 deletions
+9
-28
transformers/tests/tokenization_bert_test.py
transformers/tests/tokenization_bert_test.py
+0
-13
transformers/tests/tokenization_gpt2_test.py
transformers/tests/tokenization_gpt2_test.py
+0
-15
transformers/tests/tokenization_tests_commons.py
transformers/tests/tokenization_tests_commons.py
+9
-0
No files found.
transformers/tests/tokenization_bert_test.py
View file @
c3248cf1
...
@@ -99,19 +99,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -99,19 +99,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
self
.
assertListEqual
(
self
.
assertListEqual
(
tokenizer
.
tokenize
(
"unwantedX running"
),
[
"[UNK]"
,
"runn"
,
"##ing"
])
tokenizer
.
tokenize
(
"unwantedX running"
),
[
"[UNK]"
,
"runn"
,
"##ing"
])
def
test_encode_decode_with_spaces
(
self
):
tokenizer
=
self
.
get_tokenizer
()
new_toks
=
[
'[ABC]'
,
'[DEF]'
,
'GHI IHG'
]
tokenizer
.
add_tokens
(
new_toks
)
input
=
"unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]"
encoded
=
tokenizer
.
encode
(
input
)
decoded
=
tokenizer
.
decode
(
encoded
)
self
.
assertEqual
(
decoded
.
lower
(),
(
"[CLS] "
+
input
+
" [SEP]"
).
lower
()
)
def
test_is_whitespace
(
self
):
def
test_is_whitespace
(
self
):
self
.
assertTrue
(
_is_whitespace
(
u
" "
))
self
.
assertTrue
(
_is_whitespace
(
u
" "
))
self
.
assertTrue
(
_is_whitespace
(
u
"
\t
"
))
self
.
assertTrue
(
_is_whitespace
(
u
"
\t
"
))
...
...
transformers/tests/tokenization_gpt2_test.py
View file @
c3248cf1
...
@@ -67,20 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -67,20 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
self
.
assertListEqual
(
self
.
assertListEqual
(
tokenizer
.
convert_tokens_to_ids
(
input_tokens
),
input_bpe_tokens
)
tokenizer
.
convert_tokens_to_ids
(
input_tokens
),
input_bpe_tokens
)
def
test_encode_decode_with_spaces
(
self
):
tokenizer
=
self
.
get_tokenizer
()
new_toks
=
[
'[ABC]'
,
'[DEF]'
,
'GHI IHG'
]
tokenizer
.
add_tokens
(
new_toks
)
input
=
"lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower [DEF]"
encoded
=
tokenizer
.
encode
(
input
)
decoded
=
tokenizer
.
decode
(
encoded
)
self
.
assertEqual
(
decoded
.
lower
(),
input
.
lower
()
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
transformers/tests/tokenization_tests_commons.py
View file @
c3248cf1
...
@@ -232,6 +232,15 @@ class CommonTestCases:
...
@@ -232,6 +232,15 @@ class CommonTestCases:
self
.
assertNotEqual
(
len
(
tokens_2
),
0
)
self
.
assertNotEqual
(
len
(
tokens_2
),
0
)
self
.
assertIsInstance
(
text_2
,
(
str
,
unicode
))
self
.
assertIsInstance
(
text_2
,
(
str
,
unicode
))
def
test_encode_decode_with_spaces
(
self
):
tokenizer
=
self
.
get_tokenizer
()
new_toks
=
[
'[ABC]'
,
'[DEF]'
,
'GHI IHG'
]
tokenizer
.
add_tokens
(
new_toks
)
input
=
"[ABC] [DEF] [ABC] GHI IHG [DEF]"
encoded
=
tokenizer
.
encode
(
input
,
add_special_tokens
=
False
)
decoded
=
tokenizer
.
decode
(
encoded
)
self
.
assertEqual
(
decoded
,
input
)
def
test_pretrained_model_lists
(
self
):
def
test_pretrained_model_lists
(
self
):
weights_list
=
list
(
self
.
tokenizer_class
.
max_model_input_sizes
.
keys
())
weights_list
=
list
(
self
.
tokenizer_class
.
max_model_input_sizes
.
keys
())
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment