chenpangpang/transformers — Commits

Commit 00204f2b, authored Dec 22, 2019 by Aymeric Augustin
Parent: a3c5883f

Replace CommonTestCases for tokenizers with a mixin.

This is the same change as for (TF)CommonTestCases for modeling.

Showing 16 changed files with 452 additions and 451 deletions.
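The pattern being swapped can be illustrated with a minimal sketch (simplified and not taken verbatim from the diff; the test_has_tokenizer_class method and the placeholder tokenizer class are invented for the example, while the real shared tests appear in tests/test_tokenization_common.py below):

    # Simplified sketch of the pattern change.
    import unittest


    # Before: the shared tests were hidden inside a wrapper class so that unittest
    # discovery would not pick up the abstract base as a runnable test case.
    class CommonTestCases:
        class CommonTokenizerTester(unittest.TestCase):

            tokenizer_class = None

            def test_has_tokenizer_class(self):
                self.assertIsNotNone(self.tokenizer_class)


    # After: the shared tests live in a plain mixin. Because the mixin itself is not
    # a unittest.TestCase, it is never collected on its own; each concrete suite
    # inherits from both the mixin and unittest.TestCase.
    class TokenizerTesterMixin:

        tokenizer_class = None

        def test_has_tokenizer_class(self):
            self.assertIsNotNone(self.tokenizer_class)


    class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        tokenizer_class = object  # placeholder; real subclasses set an actual tokenizer class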
Files changed:

  templates/adding_a_new_model/tests/test_tokenization_xxx.py   +3    −2
  tests/test_configuration_common.py                             +1    −1
  tests/test_model_card.py                                       +1    −1
  tests/test_optimization.py                                     +1    −1
  tests/test_tokenization_albert.py                              +3    −2
  tests/test_tokenization_bert.py                                +3    −2
  tests/test_tokenization_bert_japanese.py                       +4    −3
  tests/test_tokenization_common.py                              +412  −423
  tests/test_tokenization_ctrl.py                                +3    −2
  tests/test_tokenization_gpt2.py                                +3    −2
  tests/test_tokenization_openai.py                              +3    −2
  tests/test_tokenization_roberta.py                             +3    −2
  tests/test_tokenization_t5.py                                  +3    −2
  tests/test_tokenization_transfo_xl.py                          +3    −2
  tests/test_tokenization_xlm.py                                 +3    −2
  tests/test_tokenization_xlnet.py                               +3    −2
templates/adding_a_new_model/tests/test_tokenization_xxx.py  (+3 −2)

@@ -15,14 +15,15 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
 from io import open

 from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = XxxTokenizer
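For a concrete model, the template is filled in by implementing the two hooks the mixin leaves abstract, get_tokenizer and get_input_output_texts (see tests/test_tokenization_common.py below). A hypothetical sketch along the lines of the existing BERT test, with an invented class name and a simplified vocabulary; the real tests/test_tokenization_bert.py in this commit remains the authoritative version:

    # Hypothetical, illustrative filling-in of the template for a WordPiece tokenizer.
    import os
    import unittest

    from transformers.tokenization_bert import VOCAB_FILES_NAMES, BertTokenizer

    from .test_tokenization_common import TokenizerTesterMixin


    class ExampleBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        tokenizer_class = BertTokenizer

        def setUp(self):
            super(ExampleBertTokenizationTest, self).setUp()

            # Write a tiny WordPiece vocabulary into the temporary directory
            # created by TokenizerTesterMixin.setUp().
            vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing", ",", "low"]
            self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
            with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join(token + "\n" for token in vocab_tokens))

        def get_tokenizer(self, **kwargs):
            # Load the tokenizer back from the temporary directory prepared above.
            return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

        def get_input_output_texts(self):
            input_text = "UNwant\u00E9d,running"
            output_text = "unwanted, running"
            return input_text, output_text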
tests/test_configuration_common.py  (+1 −1)

The one-line change in this file touches the TemporaryDirectory import; the old and new versions of that line render identically at this level of detail.

@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
 import json
 import os

 from .test_tokenization_common import TemporaryDirectory

 class ConfigTester(object):
tests/test_model_card.py  (+1 −1)

As in test_configuration_common.py, the one-line change touches the TemporaryDirectory import.

@@ -20,7 +20,7 @@ import unittest
 from transformers.modelcard import ModelCard

 from .test_tokenization_common import TemporaryDirectory

 class ModelCardTester(unittest.TestCase):
tests/test_optimization.py  (+1 −1)

As above, the one-line change touches the TemporaryDirectory import.

@@ -19,7 +19,7 @@ import unittest
 from transformers import is_torch_available

 from .test_tokenization_common import TemporaryDirectory

 from .utils import require_torch
tests/test_tokenization_albert.py  (+3 −2)

@@ -15,16 +15,17 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest

 from transformers.tokenization_albert import AlbertTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin

 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")


-class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = AlbertTokenizer
tests/test_tokenization_bert.py  (+3 −2)

@@ -15,6 +15,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
 from io import open

 from transformers.tokenization_bert import (

@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
     _is_whitespace,
 )

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow


-class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertTokenizer
tests/test_tokenization_bert_japanese.py  (+4 −3)

@@ -15,6 +15,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
 from io import open

 from transformers.tokenization_bert import WordpieceTokenizer

@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
     MecabTokenizer,
 )

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import custom_tokenizers, slow


 @custom_tokenizers
-class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertJapaneseTokenizer

@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_pair == [2] + text + [3] + text_2 + [3]


-class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = BertJapaneseTokenizer
tests/test_tokenization_common.py  (+412 −423)

@@ -18,7 +18,6 @@ import os
 import shutil
 import sys
 import tempfile
-import unittest
 from io import open

@@ -43,489 +42,479 @@ else:
     unicode = str


-class CommonTestCases:
-    class CommonTokenizerTester(unittest.TestCase):
+class TokenizerTesterMixin:

In the remainder of this hunk the body of the former nested CommonTokenizerTester is carried over into the module-level TokenizerTesterMixin; the old and new copies of each method differ essentially only in the one-level dedent, so the methods are reproduced once below in their new form.

    tokenizer_class = None

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        raise NotImplementedError

    def get_input_output_texts(self):
        raise NotImplementedError
    def test_tokenizers_common_properties(self):
        tokenizer = self.get_tokenizer()
        attributes_list = [
            "bos_token",
            "eos_token",
            "unk_token",
            "sep_token",
            "pad_token",
            "cls_token",
            "mask_token",
        ]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))
            self.assertTrue(hasattr(tokenizer, attr + "_id"))

        self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
        self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))

        attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizer = self.get_tokenizer()
        self.assertNotEqual(tokenizer.max_len, 42)

        # Now let's start the test
        tokenizer = self.get_tokenizer(max_len=42)

        before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

        with TemporaryDirectory() as tmpdirname:
            tokenizer.save_pretrained(tmpdirname)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
            self.assertListEqual(before_tokens, after_tokens)

            self.assertEqual(tokenizer.max_len, 42)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
            self.assertEqual(tokenizer.max_len, 43)

    def test_pickle_tokenizer(self):
        tokenizer = self.get_tokenizer()
        self.assertIsNotNone(tokenizer)

        text = "Munich and Berlin are nice cities"
        subwords = tokenizer.tokenize(text)

        with TemporaryDirectory() as tmpdirname:
            filename = os.path.join(tmpdirname, "tokenizer.bin")
            with open(filename, "wb") as handle:
                pickle.dump(tokenizer, handle)

            with open(filename, "rb") as handle:
                tokenizer_new = pickle.load(handle)

        subwords_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(subwords, subwords_loaded)
    def test_added_tokens_do_lower_case(self):
        tokenizer = self.get_tokenizer(do_lower_case=True)

        special_token = tokenizer.all_special_tokens[0]

        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

        toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 2)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))
        self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
        self.assertListEqual(toks, toks2)

        # Check that none of the special tokens are lowercased
        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
        tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)

        for special_token in tokenizer.all_special_tokens:
            self.assertTrue(special_token in tokenized_sequence)

        tokenizer = self.get_tokenizer(do_lower_case=False)

        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 4)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))  # Length should still be the same
        self.assertNotEqual(len(toks), len(toks0))
        self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ

    def test_add_tokens_tokenizer(self):
        tokenizer = self.get_tokenizer()

        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)

        self.assertNotEqual(vocab_size, 0)
        self.assertEqual(vocab_size, all_size)

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)

        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))

        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

        new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)

        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

        tokens = tokenizer.encode(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0], tokenizer.eos_token_id)
        self.assertEqual(tokens[-2], tokenizer.pad_token_id)
    def test_add_special_tokens(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        special_token = "[SPECIAL TOKEN]"

        tokenizer.add_special_tokens({"cls_token": special_token})
        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
        assert len(encoded_special_token) == 1

        text = " ".join([input_text, special_token, output_text])
        encoded = tokenizer.encode(text, add_special_tokens=False)

        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
        output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
        assert encoded == input_encoded + special_token_id + output_encoded

        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
        assert special_token not in decoded

    def test_required_methods_tokenizer(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        tokens = tokenizer.tokenize(input_text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
        self.assertListEqual(ids, ids_2)

        tokens_2 = tokenizer.convert_ids_to_tokens(ids)
        text_2 = tokenizer.decode(ids)

        self.assertEqual(text_2, output_text)

        self.assertNotEqual(len(tokens_2), 0)
        self.assertIsInstance(text_2, (str, unicode))

    def test_encode_decode_with_spaces(self):
        tokenizer = self.get_tokenizer()

        new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
        tokenizer.add_tokens(new_toks)
        input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
        encoded = tokenizer.encode(input, add_special_tokens=False)
        decoded = tokenizer.decode(encoded)
        self.assertEqual(decoded, input)
    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))

        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)
    def test_mask_output(self):
        if sys.version_info <= (3, 0):
            return

        tokenizer = self.get_tokenizer()

        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
            seq_0 = "Test this method."
            seq_1 = "With these inputs."
            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
            sequences, mask = information["input_ids"], information["token_type_ids"]
            self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "Test this method."
        seq_1 = "With these inputs."

        sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
        attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

        # Method is implemented (e.g. not GPT-2)
        if len(attached_sequences) != 2:
            self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

    def test_maximum_encoding_length_single_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        stride = 2

        sequence = tokenizer.encode(seq_0, add_special_tokens=False)
        num_added_tokens = tokenizer.num_added_tokens()
        total_length = len(sequence) + num_added_tokens
        information = tokenizer.encode_plus(
            seq_0,
            max_length=total_length - 2,
            add_special_tokens=True,
            stride=stride,
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), total_length - 2)
        self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

    def test_maximum_encoding_length_pair_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        seq_1 = "This is another sentence to be encoded."
        stride = 2

        sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
        sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
        truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
            tokenizer.encode(seq_0, add_special_tokens=False),
            tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
        )

        information = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_second",
            return_overflowing_tokens=True,
        )
        information_first_truncated = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_first",
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]
        overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
        self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), len(sequence) - 2)
        self.assertEqual(truncated_sequence, truncated_second_sequence)

    def test_encode_input_type(self):
        tokenizer = self.get_tokenizer()

        sequence = "Let's encode this sequence"

        tokens = tokenizer.tokenize(sequence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

        self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
        self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

    def test_special_tokens_mask(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
            sequence_1, add_special_tokens=False
        )
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing with already existing special tokens
        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
        special_tokens_mask = tokenizer.get_special_tokens_mask(
            encoded_sequence_w_special, already_has_special_tokens=True
        )
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)

    def test_padding_to_max_length(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "right"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "left"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)

        tokenizer.padding_side = "right"
        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_right_length = len(padded_sequence_right)

        tokenizer.padding_side = "left"
        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_left_length = len(padded_sequence_left)

        assert sequence_length == padded_sequence_right_length
        assert encoded_sequence == padded_sequence_right
        assert sequence_length == padded_sequence_left_length
        assert encoded_sequence == padded_sequence_left

    def test_encode_plus_with_padding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id

        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
        input_ids = encoded_sequence["input_ids"]
        token_type_ids = encoded_sequence["token_type_ids"]
        attention_mask = encoded_sequence["attention_mask"]
        special_tokens_mask = encoded_sequence["special_tokens_mask"]
        sequence_length = len(input_ids)

        # Test right padding
        tokenizer.padding_side = "right"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert input_ids + [padding_idx] * padding_size == padded_input_ids
        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
        assert attention_mask + [0] * padding_size == padded_attention_mask
        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask

        # Test left padding
        tokenizer.padding_side = "left"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + input_ids == padded_input_ids
        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
        assert [0] * padding_size + attention_mask == padded_attention_mask
        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
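A short illustration (not part of the diff) of why the mixin arrangement keeps these shared tests from running on their own: unittest discovery only collects module-level unittest.TestCase subclasses, so TokenizerTesterMixin contributes its test_* methods exclusively through concrete classes such as BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase). The DemoMixin and DemoConcreteTest names below are invented for the example:

    import unittest


    class DemoMixin:
        def test_shared_behaviour(self):
            # self.assertEqual exists here because concrete subclasses also inherit TestCase.
            self.assertEqual(1 + 1, 2)


    class DemoConcreteTest(DemoMixin, unittest.TestCase):
        pass


    if __name__ == "__main__":
        # Only DemoConcreteTest.test_shared_behaviour is collected and run;
        # DemoMixin on its own contributes nothing to the suite.
        unittest.main()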
tests/test_tokenization_ctrl.py  (+3 −2)

@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open

 from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = CTRLTokenizer
tests/test_tokenization_gpt2.py  (+3 −2)

@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open

 from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = GPT2Tokenizer
tests/test_tokenization_openai.py  (+3 −2)

@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest

 from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = OpenAIGPTTokenizer
tests/test_tokenization_roberta.py  (+3 −2)

@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest
 from io import open

 from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow


-class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = RobertaTokenizer

     def setUp(self):
tests/test_tokenization_t5.py  (+3 −2)

@@ -15,17 +15,18 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest

 from transformers.tokenization_t5 import T5Tokenizer
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin

 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")


-class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = T5Tokenizer
tests/test_tokenization_transfo_xl.py  (+3 −2)

@@ -15,11 +15,12 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
 from io import open

 from transformers import is_torch_available

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import require_torch

@@ -28,7 +29,7 @@ if is_torch_available():
 @require_torch
-class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
tests/test_tokenization_xlm.py  (+3 −2)

@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import os
 import unittest

 from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow


-class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = XLMTokenizer
tests/test_tokenization_xlnet.py  (+3 −2)

@@ -15,17 +15,18 @@
 from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest

 from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer

-from .test_tokenization_common import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow

 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")


-class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     tokenizer_class = XLNetTokenizer