chenpangpang/transformers, commit 00204f2b
Authored Dec 22, 2019 by Aymeric Augustin
Replace CommonTestCases for tokenizers with a mixin.
This is the same change as for (TF)CommonTestCases for modeling.
Parent: a3c5883f
Showing 16 changed files with 452 additions and 451 deletions.
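For context, here is a minimal sketch of the pattern this commit adopts, not the actual contents of tests/test_tokenization_common.py: the shared checks move into a plain mixin that does not inherit from unittest.TestCase, so the test runner never collects the shared base on its own, and each concrete tokenizer test combines the mixin with unittest.TestCase. The DummyTokenizer class and the test method below are illustrative only.

import unittest


class TokenizerTesterMixin:
    # Shared tokenizer checks. Not a TestCase itself, so unittest
    # discovery never runs the mixin directly.
    tokenizer_class = None

    def test_tokenizer_class_is_set(self):
        # self.assert* methods come from unittest.TestCase, which the
        # concrete subclass mixes in.
        self.assertIsNotNone(self.tokenizer_class)


class DummyTokenizer:
    # Stand-in tokenizer, used only for this illustration.
    pass


class DummyTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = DummyTokenizer

The previous layout achieved the same effect by nesting CommonTokenizerTester inside a CommonTestCases container class so that the shared base was not discovered at module level; dropping that nesting removes one indentation level from the whole file, which is presumably why tests/test_tokenization_common.py accounts for most of the 452 additions and 451 deletions.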
templates/adding_a_new_model/tests/test_tokenization_xxx.py (+3 / -2)
tests/test_configuration_common.py (+1 / -1)
tests/test_model_card.py (+1 / -1)
tests/test_optimization.py (+1 / -1)
tests/test_tokenization_albert.py (+3 / -2)
tests/test_tokenization_bert.py (+3 / -2)
tests/test_tokenization_bert_japanese.py (+4 / -3)
tests/test_tokenization_common.py (+412 / -423)
tests/test_tokenization_ctrl.py (+3 / -2)
tests/test_tokenization_gpt2.py (+3 / -2)
tests/test_tokenization_openai.py (+3 / -2)
tests/test_tokenization_roberta.py (+3 / -2)
tests/test_tokenization_t5.py (+3 / -2)
tests/test_tokenization_transfo_xl.py (+3 / -2)
tests/test_tokenization_xlm.py (+3 / -2)
tests/test_tokenization_xlnet.py (+3 / -2)
templates/adding_a_new_model/tests/test_tokenization_xxx.py

@@ -15,14 +15,15 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
 from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = XxxTokenizer
tests/test_configuration_common.py

@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
 import json
 import os
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
 class ConfigTester(object):
tests/test_model_card.py

@@ -20,7 +20,7 @@ import unittest
 from transformers.modelcard import ModelCard
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
 class ModelCardTester(unittest.TestCase):
tests/test_optimization.py

@@ -19,7 +19,7 @@ import unittest
 from transformers import is_torch_available
-from .test_tokenization_commo import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
 from .utils import require_torch
tests/test_tokenization_albert.py

@@ -15,16 +15,17 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from transformers.tokenization_albert import AlbertTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")
-class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = AlbertTokenizer
tests/test_tokenization_bert.py

@@ -15,6 +15,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
 from transformers.tokenization_bert import (

@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
     _is_whitespace,
 )
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
-class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertTokenizer
tests/test_tokenization_bert_japanese.py

@@ -15,6 +15,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
 from transformers.tokenization_bert import WordpieceTokenizer

@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
     MecabTokenizer,
 )
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import custom_tokenizers, slow
 @custom_tokenizers
-class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertJapaneseTokenizer

@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_pair == [2] + text + [3] + text_2 + [3]
-class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertJapaneseTokenizer
tests/test_tokenization_common.py

@@ -18,7 +18,6 @@ import os
 import shutil
 import sys
 import tempfile
-import unittest
 from io import open

@@ -43,8 +42,7 @@ else:
     unicode = str
-class CommonTestCases:
-    class CommonTokenizerTester(unittest.TestCase):
+class TokenizerTesterMixin:
     tokenizer_class = None

@@ -305,11 +303,7 @@ class CommonTestCases:
 num_added_tokens = tokenizer.num_added_tokens()
 total_length = len(sequence) + num_added_tokens
 information = tokenizer.encode_plus(
     seq_0,
     max_length=total_length - 2,
     add_special_tokens=True,
     stride=stride,
     return_overflowing_tokens=True,
 )
 truncated_sequence = information["input_ids"]

@@ -332,8 +326,7 @@ class CommonTestCases:
 sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
 truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
     tokenizer.encode(seq_0, add_special_tokens=False),
     tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
 )
 information = tokenizer.encode_plus(

@@ -440,9 +433,7 @@ class CommonTestCases:
 tokenizer.padding_side = "right"
 encoded_sequence = tokenizer.encode(sequence)
 sequence_length = len(encoded_sequence)
 padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
 padded_sequence_length = len(padded_sequence)
 assert sequence_length + padding_size == padded_sequence_length
 assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

@@ -451,9 +442,7 @@ class CommonTestCases:
 tokenizer.padding_side = "left"
 encoded_sequence = tokenizer.encode(sequence)
 sequence_length = len(encoded_sequence)
 padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
 padded_sequence_length = len(padded_sequence)
 assert sequence_length + padding_size == padded_sequence_length
 assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
tests/test_tokenization_ctrl.py

@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open
 from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = CTRLTokenizer
tests/test_tokenization_gpt2.py

@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open
 from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = GPT2Tokenizer
tests/test_tokenization_openai.py

@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
-class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = OpenAIGPTTokenizer
tests/test_tokenization_roberta.py

@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open
 from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
-class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = RobertaTokenizer

     def setUp(self):
tests/test_tokenization_t5.py

@@ -15,17 +15,18 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from transformers.tokenization_t5 import T5Tokenizer
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = T5Tokenizer
tests/test_tokenization_transfo_xl.py

@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
 from transformers import is_torch_available
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import require_torch

@@ -28,7 +29,7 @@ if is_torch_available():
 @require_torch
-class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
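The Transformer-XL test above guards both the class and its tokenizer_class attribute on is_torch_available(). As a rough sketch of how a skip decorator such as require_torch can be built (the real helper lives in tests/utils.py and its exact definition is not part of this diff, so treat this as an assumption):

import unittest

from transformers import is_torch_available


def require_torch(test_case):
    # Skip the decorated test function or test class when PyTorch is absent.
    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)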
tests/test_tokenization_xlm.py

@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
-class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = XLMTokenizer
tests/test_tokenization_xlnet.py

@@ -15,17 +15,18 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
-from .test_tokenization_commo import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
-class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = XLNetTokenizer