Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
00204f2b
Commit
00204f2b
authored
Dec 22, 2019
by
Aymeric Augustin
Browse files
Replace CommonTestCases for tokenizers with a mixin.
This is the same change as for (TF)CommonTestCases for modeling.
parent
a3c5883f
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
452 additions
and
451 deletions
+452
-451
templates/adding_a_new_model/tests/test_tokenization_xxx.py
templates/adding_a_new_model/tests/test_tokenization_xxx.py
+3
-2
tests/test_configuration_common.py
tests/test_configuration_common.py
+1
-1
tests/test_model_card.py
tests/test_model_card.py
+1
-1
tests/test_optimization.py
tests/test_optimization.py
+1
-1
tests/test_tokenization_albert.py
tests/test_tokenization_albert.py
+3
-2
tests/test_tokenization_bert.py
tests/test_tokenization_bert.py
+3
-2
tests/test_tokenization_bert_japanese.py
tests/test_tokenization_bert_japanese.py
+4
-3
tests/test_tokenization_common.py
tests/test_tokenization_common.py
+412
-423
tests/test_tokenization_ctrl.py
tests/test_tokenization_ctrl.py
+3
-2
tests/test_tokenization_gpt2.py
tests/test_tokenization_gpt2.py
+3
-2
tests/test_tokenization_openai.py
tests/test_tokenization_openai.py
+3
-2
tests/test_tokenization_roberta.py
tests/test_tokenization_roberta.py
+3
-2
tests/test_tokenization_t5.py
tests/test_tokenization_t5.py
+3
-2
tests/test_tokenization_transfo_xl.py
tests/test_tokenization_transfo_xl.py
+3
-2
tests/test_tokenization_xlm.py
tests/test_tokenization_xlm.py
+3
-2
tests/test_tokenization_xlnet.py
tests/test_tokenization_xlnet.py
+3
-2
No files found.
templates/adding_a_new_model/tests/test_tokenization_xxx.py
View file @
00204f2b
...
@@ -15,14 +15,15 @@
...
@@ -15,14 +15,15 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_bert
import
VOCAB_FILES_NAMES
,
XxxTokenizer
from
transformers.tokenization_bert
import
VOCAB_FILES_NAMES
,
XxxTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
class
XxxTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
XxxTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
XxxTokenizer
tokenizer_class
=
XxxTokenizer
...
...
tests/test_configuration_common.py
View file @
00204f2b
...
@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
...
@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
import
json
import
json
import
os
import
os
from
.test_tokenization_commo
import
TemporaryDirectory
from
.test_tokenization_commo
n
import
TemporaryDirectory
class
ConfigTester
(
object
):
class
ConfigTester
(
object
):
...
...
tests/test_model_card.py
View file @
00204f2b
...
@@ -20,7 +20,7 @@ import unittest
...
@@ -20,7 +20,7 @@ import unittest
from
transformers.modelcard
import
ModelCard
from
transformers.modelcard
import
ModelCard
from
.test_tokenization_commo
import
TemporaryDirectory
from
.test_tokenization_commo
n
import
TemporaryDirectory
class
ModelCardTester
(
unittest
.
TestCase
):
class
ModelCardTester
(
unittest
.
TestCase
):
...
...
tests/test_optimization.py
View file @
00204f2b
...
@@ -19,7 +19,7 @@ import unittest
...
@@ -19,7 +19,7 @@ import unittest
from
transformers
import
is_torch_available
from
transformers
import
is_torch_available
from
.test_tokenization_commo
import
TemporaryDirectory
from
.test_tokenization_commo
n
import
TemporaryDirectory
from
.utils
import
require_torch
from
.utils
import
require_torch
...
...
tests/test_tokenization_albert.py
View file @
00204f2b
...
@@ -15,16 +15,17 @@
...
@@ -15,16 +15,17 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
transformers.tokenization_albert
import
AlbertTokenizer
from
transformers.tokenization_albert
import
AlbertTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/spiece.model"
)
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/spiece.model"
)
class
AlbertTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
AlbertTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
AlbertTokenizer
tokenizer_class
=
AlbertTokenizer
...
...
tests/test_tokenization_bert.py
View file @
00204f2b
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_bert
import
(
from
transformers.tokenization_bert
import
(
...
@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
...
@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
_is_whitespace
,
_is_whitespace
,
)
)
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
slow
from
.utils
import
slow
class
BertTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
BertTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
BertTokenizer
tokenizer_class
=
BertTokenizer
...
...
tests/test_tokenization_bert_japanese.py
View file @
00204f2b
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_bert
import
WordpieceTokenizer
from
transformers.tokenization_bert
import
WordpieceTokenizer
...
@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
...
@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
MecabTokenizer
,
MecabTokenizer
,
)
)
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
custom_tokenizers
,
slow
from
.utils
import
custom_tokenizers
,
slow
@
custom_tokenizers
@
custom_tokenizers
class
BertJapaneseTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
BertJapaneseTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
BertJapaneseTokenizer
tokenizer_class
=
BertJapaneseTokenizer
...
@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
...
@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
assert
encoded_pair
==
[
2
]
+
text
+
[
3
]
+
text_2
+
[
3
]
assert
encoded_pair
==
[
2
]
+
text
+
[
3
]
+
text_2
+
[
3
]
class
BertJapaneseCharacterTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
BertJapaneseCharacterTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
BertJapaneseTokenizer
tokenizer_class
=
BertJapaneseTokenizer
...
...
tests/test_tokenization_common.py
View file @
00204f2b
...
@@ -18,7 +18,6 @@ import os
...
@@ -18,7 +18,6 @@ import os
import
shutil
import
shutil
import
sys
import
sys
import
tempfile
import
tempfile
import
unittest
from
io
import
open
from
io
import
open
...
@@ -43,489 +42,479 @@ else:
...
@@ -43,489 +42,479 @@ else:
unicode
=
str
unicode
=
str
class
CommonTestCases
:
class
TokenizerTesterMixin
:
class
CommonTokenizerTester
(
unittest
.
TestCase
):
tokenizer_class
=
None
tokenizer_class
=
None
def
setUp
(
self
):
def
setUp
(
self
):
self
.
tmpdirname
=
tempfile
.
mkdtemp
()
self
.
tmpdirname
=
tempfile
.
mkdtemp
()
def
tearDown
(
self
):
def
tearDown
(
self
):
shutil
.
rmtree
(
self
.
tmpdirname
)
shutil
.
rmtree
(
self
.
tmpdirname
)
def
get_tokenizer
(
self
,
**
kwargs
):
def
get_tokenizer
(
self
,
**
kwargs
):
raise
NotImplementedError
raise
NotImplementedError
def
get_input_output_texts
(
self
):
def
get_input_output_texts
(
self
):
raise
NotImplementedError
raise
NotImplementedError
def
test_tokenizers_common_properties
(
self
):
def
test_tokenizers_common_properties
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
attributes_list
=
[
attributes_list
=
[
"bos_token"
,
"bos_token"
,
"eos_token"
,
"eos_token"
,
"unk_token"
,
"unk_token"
,
"sep_token"
,
"sep_token"
,
"pad_token"
,
"pad_token"
,
"cls_token"
,
"cls_token"
,
"mask_token"
,
"mask_token"
,
]
]
for
attr
in
attributes_list
:
for
attr
in
attributes_list
:
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
+
"_id"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
+
"_id"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens_ids"
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
"additional_special_tokens_ids"
))
attributes_list
=
[
"max_len"
,
"init_inputs"
,
"init_kwargs"
,
"added_tokens_encoder"
,
"added_tokens_decoder"
]
attributes_list
=
[
"max_len"
,
"init_inputs"
,
"init_kwargs"
,
"added_tokens_encoder"
,
"added_tokens_decoder"
]
for
attr
in
attributes_list
:
for
attr
in
attributes_list
:
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
self
.
assertTrue
(
hasattr
(
tokenizer
,
attr
))
def
test_save_and_load_tokenizer
(
self
):
def
test_save_and_load_tokenizer
(
self
):
# safety check on max_len default value so we are sure the test works
# safety check on max_len default value so we are sure the test works
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
self
.
assertNotEqual
(
tokenizer
.
max_len
,
42
)
self
.
assertNotEqual
(
tokenizer
.
max_len
,
42
)
# Now let's start the test
# Now let's start the test
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
before_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
before_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
with
TemporaryDirectory
()
as
tmpdirname
:
with
TemporaryDirectory
()
as
tmpdirname
:
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
after_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
after_tokens
=
tokenizer
.
encode
(
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
,
max_len
=
43
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
,
max_len
=
43
)
self
.
assertEqual
(
tokenizer
.
max_len
,
43
)
self
.
assertEqual
(
tokenizer
.
max_len
,
43
)
def
test_pickle_tokenizer
(
self
):
def
test_pickle_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
self
.
assertIsNotNone
(
tokenizer
)
self
.
assertIsNotNone
(
tokenizer
)
text
=
"Munich and Berlin are nice cities"
text
=
"Munich and Berlin are nice cities"
subwords
=
tokenizer
.
tokenize
(
text
)
subwords
=
tokenizer
.
tokenize
(
text
)
with
TemporaryDirectory
()
as
tmpdirname
:
with
TemporaryDirectory
()
as
tmpdirname
:
filename
=
os
.
path
.
join
(
tmpdirname
,
"tokenizer.bin"
)
filename
=
os
.
path
.
join
(
tmpdirname
,
"tokenizer.bin"
)
with
open
(
filename
,
"wb"
)
as
handle
:
with
open
(
filename
,
"wb"
)
as
handle
:
pickle
.
dump
(
tokenizer
,
handle
)
pickle
.
dump
(
tokenizer
,
handle
)
with
open
(
filename
,
"rb"
)
as
handle
:
with
open
(
filename
,
"rb"
)
as
handle
:
tokenizer_new
=
pickle
.
load
(
handle
)
tokenizer_new
=
pickle
.
load
(
handle
)
subwords_loaded
=
tokenizer_new
.
tokenize
(
text
)
subwords_loaded
=
tokenizer_new
.
tokenize
(
text
)
self
.
assertListEqual
(
subwords
,
subwords_loaded
)
self
.
assertListEqual
(
subwords
,
subwords_loaded
)
def
test_added_tokens_do_lower_case
(
self
):
def
test_added_tokens_do_lower_case
(
self
):
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
True
)
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
True
)
special_token
=
tokenizer
.
all_special_tokens
[
0
]
special_token
=
tokenizer
.
all_special_tokens
[
0
]
text
=
special_token
+
" aaaaa bbbbbb low cccccccccdddddddd l "
+
special_token
text
=
special_token
+
" aaaaa bbbbbb low cccccccccdddddddd l "
+
special_token
text2
=
special_token
+
" AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l "
+
special_token
text2
=
special_token
+
" AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l "
+
special_token
toks0
=
tokenizer
.
tokenize
(
text
)
# toks before adding new_toks
toks0
=
tokenizer
.
tokenize
(
text
)
# toks before adding new_toks
new_toks
=
[
"aaaaa bbbbbb"
,
"cccccccccdddddddd"
,
"AAAAA BBBBBB"
,
"CCCCCCCCCDDDDDDDD"
]
new_toks
=
[
"aaaaa bbbbbb"
,
"cccccccccdddddddd"
,
"AAAAA BBBBBB"
,
"CCCCCCCCCDDDDDDDD"
]
added
=
tokenizer
.
add_tokens
(
new_toks
)
added
=
tokenizer
.
add_tokens
(
new_toks
)
self
.
assertEqual
(
added
,
2
)
self
.
assertEqual
(
added
,
2
)
toks
=
tokenizer
.
tokenize
(
text
)
toks
=
tokenizer
.
tokenize
(
text
)
toks2
=
tokenizer
.
tokenize
(
text2
)
toks2
=
tokenizer
.
tokenize
(
text2
)
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
# toks0 should be longer
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
# toks0 should be longer
self
.
assertListEqual
(
toks
,
toks2
)
self
.
assertListEqual
(
toks
,
toks2
)
# Check that none of the special tokens are lowercased
# Check that none of the special tokens are lowercased
sequence_with_special_tokens
=
"A "
+
" yEs "
.
join
(
tokenizer
.
all_special_tokens
)
+
" B"
sequence_with_special_tokens
=
"A "
+
" yEs "
.
join
(
tokenizer
.
all_special_tokens
)
+
" B"
tokenized_sequence
=
tokenizer
.
tokenize
(
sequence_with_special_tokens
)
tokenized_sequence
=
tokenizer
.
tokenize
(
sequence_with_special_tokens
)
for
special_token
in
tokenizer
.
all_special_tokens
:
for
special_token
in
tokenizer
.
all_special_tokens
:
self
.
assertTrue
(
special_token
in
tokenized_sequence
)
self
.
assertTrue
(
special_token
in
tokenized_sequence
)
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
False
)
tokenizer
=
self
.
get_tokenizer
(
do_lower_case
=
False
)
added
=
tokenizer
.
add_tokens
(
new_toks
)
added
=
tokenizer
.
add_tokens
(
new_toks
)
self
.
assertEqual
(
added
,
4
)
self
.
assertEqual
(
added
,
4
)
toks
=
tokenizer
.
tokenize
(
text
)
toks
=
tokenizer
.
tokenize
(
text
)
toks2
=
tokenizer
.
tokenize
(
text2
)
toks2
=
tokenizer
.
tokenize
(
text2
)
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
# Length should still be the same
self
.
assertEqual
(
len
(
toks
),
len
(
toks2
))
# Length should still be the same
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
self
.
assertNotEqual
(
len
(
toks
),
len
(
toks0
))
self
.
assertNotEqual
(
toks
[
1
],
toks2
[
1
])
# But at least the first non-special tokens should differ
self
.
assertNotEqual
(
toks
[
1
],
toks2
[
1
])
# But at least the first non-special tokens should differ
def
test_add_tokens_tokenizer
(
self
):
def
test_add_tokens_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
all_size
=
len
(
tokenizer
)
all_size
=
len
(
tokenizer
)
self
.
assertNotEqual
(
vocab_size
,
0
)
self
.
assertNotEqual
(
vocab_size
,
0
)
self
.
assertEqual
(
vocab_size
,
all_size
)
self
.
assertEqual
(
vocab_size
,
all_size
)
new_toks
=
[
"aaaaa bbbbbb"
,
"cccccccccdddddddd"
]
new_toks
=
[
"aaaaa bbbbbb"
,
"cccccccccdddddddd"
]
added_toks
=
tokenizer
.
add_tokens
(
new_toks
)
added_toks
=
tokenizer
.
add_tokens
(
new_toks
)
vocab_size_2
=
tokenizer
.
vocab_size
vocab_size_2
=
tokenizer
.
vocab_size
all_size_2
=
len
(
tokenizer
)
all_size_2
=
len
(
tokenizer
)
self
.
assertNotEqual
(
vocab_size_2
,
0
)
self
.
assertNotEqual
(
vocab_size_2
,
0
)
self
.
assertEqual
(
vocab_size
,
vocab_size_2
)
self
.
assertEqual
(
vocab_size
,
vocab_size_2
)
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
,
add_special_tokens
=
False
)
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
,
add_special_tokens
=
False
)
out_string
=
tokenizer
.
decode
(
tokens
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
new_toks_2
=
{
"eos_token"
:
">>>>|||<||<<|<<"
,
"pad_token"
:
"<<<<<|||>|>>>>|>"
}
new_toks_2
=
{
"eos_token"
:
">>>>|||<||<<|<<"
,
"pad_token"
:
"<<<<<|||>|>>>>|>"
}
added_toks_2
=
tokenizer
.
add_special_tokens
(
new_toks_2
)
added_toks_2
=
tokenizer
.
add_special_tokens
(
new_toks_2
)
vocab_size_3
=
tokenizer
.
vocab_size
vocab_size_3
=
tokenizer
.
vocab_size
all_size_3
=
len
(
tokenizer
)
all_size_3
=
len
(
tokenizer
)
self
.
assertNotEqual
(
vocab_size_3
,
0
)
self
.
assertNotEqual
(
vocab_size_3
,
0
)
self
.
assertEqual
(
vocab_size
,
vocab_size_3
)
self
.
assertEqual
(
vocab_size
,
vocab_size_3
)
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
tokens
=
tokenizer
.
encode
(
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
,
add_special_tokens
=
False
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
,
add_special_tokens
=
False
)
)
out_string
=
tokenizer
.
decode
(
tokens
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
0
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
0
],
tokens
[
1
])
self
.
assertGreater
(
tokens
[
0
],
tokens
[
1
])
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokenizer
.
vocab_size
-
1
)
self
.
assertGreater
(
tokens
[
-
2
],
tokens
[
-
3
])
self
.
assertGreater
(
tokens
[
-
2
],
tokens
[
-
3
])
self
.
assertEqual
(
tokens
[
0
],
tokenizer
.
eos_token_id
)
self
.
assertEqual
(
tokens
[
0
],
tokenizer
.
eos_token_id
)
self
.
assertEqual
(
tokens
[
-
2
],
tokenizer
.
pad_token_id
)
self
.
assertEqual
(
tokens
[
-
2
],
tokenizer
.
pad_token_id
)
def
test_add_special_tokens
(
self
):
def
test_add_special_tokens
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
input_text
,
output_text
=
self
.
get_input_output_texts
()
input_text
,
output_text
=
self
.
get_input_output_texts
()
special_token
=
"[SPECIAL TOKEN]"
special_token
=
"[SPECIAL TOKEN]"
tokenizer
.
add_special_tokens
({
"cls_token"
:
special_token
})
tokenizer
.
add_special_tokens
({
"cls_token"
:
special_token
})
encoded_special_token
=
tokenizer
.
encode
(
special_token
,
add_special_tokens
=
False
)
encoded_special_token
=
tokenizer
.
encode
(
special_token
,
add_special_tokens
=
False
)
assert
len
(
encoded_special_token
)
==
1
assert
len
(
encoded_special_token
)
==
1
text
=
" "
.
join
([
input_text
,
special_token
,
output_text
])
text
=
" "
.
join
([
input_text
,
special_token
,
output_text
])
encoded
=
tokenizer
.
encode
(
text
,
add_special_tokens
=
False
)
encoded
=
tokenizer
.
encode
(
text
,
add_special_tokens
=
False
)
input_encoded
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
input_encoded
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
output_encoded
=
tokenizer
.
encode
(
output_text
,
add_special_tokens
=
False
)
output_encoded
=
tokenizer
.
encode
(
output_text
,
add_special_tokens
=
False
)
special_token_id
=
tokenizer
.
encode
(
special_token
,
add_special_tokens
=
False
)
special_token_id
=
tokenizer
.
encode
(
special_token
,
add_special_tokens
=
False
)
assert
encoded
==
input_encoded
+
special_token_id
+
output_encoded
assert
encoded
==
input_encoded
+
special_token_id
+
output_encoded
decoded
=
tokenizer
.
decode
(
encoded
,
skip_special_tokens
=
True
)
decoded
=
tokenizer
.
decode
(
encoded
,
skip_special_tokens
=
True
)
assert
special_token
not
in
decoded
assert
special_token
not
in
decoded
def
test_required_methods_tokenizer
(
self
):
def
test_required_methods_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
input_text
,
output_text
=
self
.
get_input_output_texts
()
input_text
,
output_text
=
self
.
get_input_output_texts
()
tokens
=
tokenizer
.
tokenize
(
input_text
)
tokens
=
tokenizer
.
tokenize
(
input_text
)
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
ids_2
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
ids_2
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
ids
,
ids_2
)
self
.
assertListEqual
(
ids
,
ids_2
)
tokens_2
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
tokens_2
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
text_2
=
tokenizer
.
decode
(
ids
)
text_2
=
tokenizer
.
decode
(
ids
)
self
.
assertEqual
(
text_2
,
output_text
)
self
.
assertEqual
(
text_2
,
output_text
)
self
.
assertNotEqual
(
len
(
tokens_2
),
0
)
self
.
assertNotEqual
(
len
(
tokens_2
),
0
)
self
.
assertIsInstance
(
text_2
,
(
str
,
unicode
))
self
.
assertIsInstance
(
text_2
,
(
str
,
unicode
))
def
test_encode_decode_with_spaces
(
self
):
def
test_encode_decode_with_spaces
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
new_toks
=
[
"[ABC]"
,
"[DEF]"
,
"GHI IHG"
]
new_toks
=
[
"[ABC]"
,
"[DEF]"
,
"GHI IHG"
]
tokenizer
.
add_tokens
(
new_toks
)
tokenizer
.
add_tokens
(
new_toks
)
input
=
"[ABC] [DEF] [ABC] GHI IHG [DEF]"
input
=
"[ABC] [DEF] [ABC] GHI IHG [DEF]"
encoded
=
tokenizer
.
encode
(
input
,
add_special_tokens
=
False
)
encoded
=
tokenizer
.
encode
(
input
,
add_special_tokens
=
False
)
decoded
=
tokenizer
.
decode
(
encoded
)
decoded
=
tokenizer
.
decode
(
encoded
)
self
.
assertEqual
(
decoded
,
input
)
self
.
assertEqual
(
decoded
,
input
)
def
test_pretrained_model_lists
(
self
):
def
test_pretrained_model_lists
(
self
):
weights_list
=
list
(
self
.
tokenizer_class
.
max_model_input_sizes
.
keys
())
weights_list
=
list
(
self
.
tokenizer_class
.
max_model_input_sizes
.
keys
())
weights_lists_2
=
[]
weights_lists_2
=
[]
for
file_id
,
map_list
in
self
.
tokenizer_class
.
pretrained_vocab_files_map
.
items
():
for
file_id
,
map_list
in
self
.
tokenizer_class
.
pretrained_vocab_files_map
.
items
():
weights_lists_2
.
append
(
list
(
map_list
.
keys
()))
weights_lists_2
.
append
(
list
(
map_list
.
keys
()))
for
weights_list_2
in
weights_lists_2
:
for
weights_list_2
in
weights_lists_2
:
self
.
assertListEqual
(
weights_list
,
weights_list_2
)
self
.
assertListEqual
(
weights_list
,
weights_list_2
)
def
test_mask_output
(
self
):
def
test_mask_output
(
self
):
if
sys
.
version_info
<=
(
3
,
0
):
if
sys
.
version_info
<=
(
3
,
0
):
return
return
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
if
tokenizer
.
build_inputs_with_special_tokens
.
__qualname__
.
split
(
"."
)[
0
]
!=
"PreTrainedTokenizer"
:
seq_0
=
"Test this method."
seq_1
=
"With these inputs."
information
=
tokenizer
.
encode_plus
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
sequences
,
mask
=
information
[
"input_ids"
],
information
[
"token_type_ids"
]
self
.
assertEqual
(
len
(
sequences
),
len
(
mask
))
def
test_number_of_added_tokens
(
self
):
tokenizer
=
self
.
get_tokenizer
()
if
tokenizer
.
build_inputs_with_special_tokens
.
__qualname__
.
split
(
"."
)[
0
]
!=
"PreTrainedTokenizer"
:
seq_0
=
"Test this method."
seq_0
=
"Test this method."
seq_1
=
"With these inputs."
seq_1
=
"With these inputs."
information
=
tokenizer
.
encode_plus
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
False
)
sequences
,
mask
=
information
[
"input_ids"
],
information
[
"token_type_ids"
]
attached_sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
self
.
assertEqual
(
len
(
sequences
),
len
(
mask
)
)
# Method is implemented (e.g. not GPT-2)
def
test_number_of_added_tokens
(
self
):
if
len
(
attached_sequences
)
!=
2
:
tokenizer
=
self
.
get_tokenizer
()
self
.
assertEqual
(
tokenizer
.
num_added_tokens
(
pair
=
True
),
len
(
attached_sequences
)
-
len
(
sequences
))
seq_0
=
"Test this method."
def
test_maximum_encoding_length_single_input
(
self
):
seq_1
=
"With these inputs."
tokenizer
=
self
.
get_tokenizer
()
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
False
)
seq_0
=
"This is a sentence to be encoded."
attached_sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
stride
=
2
# Method is implemented (e.g. not GPT-2)
sequence
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
if
len
(
attached_sequences
)
!=
2
:
num_added_tokens
=
tokenizer
.
num_added_tokens
()
self
.
assertEqual
(
tokenizer
.
num_added_tokens
(
pair
=
True
),
len
(
attached_sequences
)
-
len
(
sequences
)
)
total_length
=
len
(
sequence
)
+
num_added_tokens
information
=
tokenizer
.
encode_plus
(
def
test_maximum_encoding_length_single_input
(
self
):
seq_0
,
tokenizer
=
self
.
get_tokenizer
()
max_length
=
total_length
-
2
,
add_special_tokens
=
True
,
seq_0
=
"This is a sentence to be encoded."
stride
=
stride
,
stride
=
2
return_overflowing_tokens
=
True
,
)
sequence
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
num_added_tokens
=
tokenizer
.
num_added_tokens
()
truncated_sequence
=
information
[
"input_ids"
]
total_length
=
len
(
sequence
)
+
num_added_tokens
overflowing_tokens
=
information
[
"overflowing_tokens"
]
information
=
tokenizer
.
encode_plus
(
seq_0
,
max_length
=
total_length
-
2
,
add_special_tokens
=
True
,
stride
=
stride
,
return_overflowing_tokens
=
True
,
self
.
assertEqual
(
len
(
overflowing_tokens
),
2
+
stride
)
)
self
.
assertEqual
(
overflowing_tokens
,
sequence
[
-
(
2
+
stride
)
:])
self
.
assertEqual
(
len
(
truncated_sequence
),
total_length
-
2
)
truncated_sequence
=
information
[
"input_ids"
]
self
.
assertEqual
(
truncated_sequence
,
tokenizer
.
build_inputs_with_special_tokens
(
sequence
[:
-
2
]))
overflowing_tokens
=
information
[
"overflowing_tokens"
]
def
test_maximum_encoding_length_pair_input
(
self
):
self
.
assertEqual
(
len
(
overflowing_tokens
),
2
+
stride
)
tokenizer
=
self
.
get_tokenizer
(
)
self
.
assertEqual
(
overflowing_tokens
,
sequence
[
-
(
2
+
stride
)
:]
)
self
.
assertEqual
(
len
(
truncated_sequence
),
total_length
-
2
)
seq_0
=
"This is a sent
ence to
be encoded."
self
.
assertEqual
(
truncated_sequ
ence
,
to
kenizer
.
build_inputs_with_special_tokens
(
sequence
[:
-
2
]))
seq_1
=
"This is another sentence to be encoded."
stride
=
2
def
test_maximum_encoding_length_pair_input
(
self
):
tokenizer
=
self
.
get_tokenizer
()
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)
seq_0
=
"This is a sentence to be encoded."
seq_1
=
"This is another sentence to be encoded."
sequence
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
stride
=
2
truncated_second_sequence
=
tokenizer
.
build_inputs_with_special_tokens
(
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
,
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)
[:
-
2
],
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)
)
sequence
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
information
=
tokenizer
.
encode_plu
s
(
truncated_second_sequence
=
tokenizer
.
build_inputs_with_special_token
s
(
seq_0
,
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
),
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)[:
-
2
]
,
seq_1
,
)
max_length
=
len
(
sequence
)
-
2
,
add_special_tokens
=
True
,
information
=
tokenizer
.
encode_plus
(
stride
=
stride
,
seq_0
,
truncation_strategy
=
"only_second"
,
seq_1
,
return_overflowing_tokens
=
True
,
max_length
=
len
(
sequence
)
-
2
,
)
add_special_tokens
=
True
,
information_first_truncated
=
tokenizer
.
encode_plus
(
stride
=
stride
,
seq_0
,
truncation_strategy
=
"only_second"
,
seq_1
,
return_overflowing_tokens
=
True
,
max_length
=
len
(
sequence
)
-
2
,
)
add_special_tokens
=
True
,
information_first_truncated
=
tokenizer
.
encode_plus
(
stride
=
stride
,
seq_0
,
truncation_strategy
=
"only_first"
,
seq_1
,
return_overflowing_tokens
=
True
,
max_length
=
len
(
sequence
)
-
2
,
)
add_special_tokens
=
True
,
stride
=
stride
,
truncat
ed_sequence
=
information
[
"input_ids"
]
truncat
ion_strategy
=
"only_first"
,
overflowing_tokens
=
information
[
"
overflowing_tokens
"
]
return_
overflowing_tokens
=
True
,
overflowing_tokens_first_truncated
=
information_first_truncated
[
"overflowing_tokens"
]
)
self
.
assertEqual
(
len
(
overflowing_tokens
),
2
+
stride
)
truncated_sequence
=
information
[
"input_ids"
]
self
.
assertEqual
(
overflowing_tokens
,
sequence_1_no_special_tokens
[
-
(
2
+
stride
)
:])
overflowing_tokens
=
information
[
"overflowing_tokens"
]
self
.
assertEqual
(
overflowing_tokens_first_truncated
,
sequence_0_no_special_tokens
[
-
(
2
+
stride
)
:])
overflowing_tokens_first_truncated
=
information_first_truncated
[
"overflowing_tokens"
]
self
.
assertEqual
(
len
(
truncated_sequence
),
len
(
sequence
)
-
2
)
self
.
assertEqual
(
truncated_sequence
,
truncated_second_sequenc
e
)
self
.
assertEqual
(
len
(
overflowing_tokens
),
2
+
strid
e
)
self
.
assertEqual
(
overflowing_tokens
,
sequence_1_no_special_tokens
[
-
(
2
+
stride
)
:])
def
test_encode_input_type
(
self
):
self
.
assertEqual
(
overflowing_tokens_first_truncated
,
sequence_0_no_special_tokens
[
-
(
2
+
stride
)
:])
tokenizer
=
self
.
get_tokenizer
(
)
self
.
assertEqual
(
len
(
truncated_sequence
),
len
(
sequence
)
-
2
)
self
.
assertEqual
(
truncated_sequence
,
truncated_second_sequence
)
sequence
=
"Let's encode this sequence"
def
test_encode_input_type
(
self
):
token
s
=
tokenizer
.
tokenize
(
sequence
)
token
izer
=
self
.
get_tokenizer
(
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
formatted_input
=
tokenizer
.
encode
(
sequence
,
add_special_tokens
=
True
)
sequence
=
"Let's encode this sequence"
self
.
assertEqual
(
tokenizer
.
encode
(
tokens
,
add_special_tokens
=
True
),
formatted_input
)
tokens
=
tokenizer
.
tokenize
(
sequence
)
self
.
assertEqual
(
tokenizer
.
encode
(
input_ids
,
add_special_tokens
=
True
),
formatted_input
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
formatted_input
=
tokenizer
.
encode
(
sequence
,
add_special_tokens
=
True
)
def
test_special_tokens_mask
(
self
):
tokenizer
=
self
.
get_tokenizer
(
)
self
.
assertEqual
(
tokenizer
.
encode
(
tokens
,
add_special_tokens
=
True
),
formatted_input
)
self
.
assertEqual
(
tokenizer
.
encode
(
input_ids
,
add_special_tokens
=
True
),
formatted_input
)
sequence_0
=
"Encode this."
sequence_1
=
"This one too please."
def
test_special_tokens_mask
(
self
):
tokenizer
=
self
.
get_tokenizer
()
# Testing single inputs
encoded_
sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
sequence
_0
=
"Encode this."
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_1
=
"This one too please."
sequence_0
,
add_special_tokens
=
True
,
return_special_tokens_mask
=
True
)
# Testing single inputs
encoded_sequence
_w_special
=
encode
d_
sequence_
dict
[
"input_ids"
]
encoded_sequence
=
tokenizer
.
encode
(
sequence_
0
,
add_special_tokens
=
False
)
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
se
lf
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_w_special
))
se
quence_0
,
add_special_tokens
=
True
,
return_special_tokens_mask
=
True
)
filter
ed_sequence
=
[
encod
ed_sequence
_w_special
=
encoded_sequence_dict
[
"input_ids"
]
(
x
if
not
special_tokens_mask
[
i
]
else
None
)
for
i
,
x
in
enumerate
(
encoded_sequence_
w_
special
)
special_tokens_mask
=
encoded_sequence_
dict
[
"
special
_tokens_mask"
]
]
self
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_w_special
))
filtered_sequence
=
[
x
for
x
in
filtered_sequence
if
x
is
not
None
]
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
filtered_sequence
=
[
(
x
if
not
special_tokens_mask
[
i
]
else
None
)
for
i
,
x
in
enumerate
(
encoded_sequence_w_special
)
# Testing inputs pairs
]
encod
ed_sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
+
tokenizer
.
encode
(
filter
ed_sequence
=
[
x
for
x
in
filtered_sequence
if
x
is
not
None
]
sequence_1
,
add_special_tokens
=
False
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
# Testing inputs pairs
sequence_0
,
sequence_
1
,
add_special_tokens
=
True
,
return_special_tokens_mask
=
True
encoded_sequence
=
tokenizer
.
encode
(
sequence_
0
,
add_special_tokens
=
False
)
+
tokenizer
.
encode
(
)
sequence_1
,
add_special_tokens
=
False
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
)
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
se
lf
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_w_special
))
se
quence_0
,
sequence_1
,
add_special_tokens
=
True
,
return_special_tokens_mask
=
True
)
filter
ed_sequence
=
[
encod
ed_sequence
_w_special
=
encoded_sequence_dict
[
"input_ids"
]
(
x
if
not
special_tokens_mask
[
i
]
else
None
)
for
i
,
x
in
enumerate
(
encoded_sequence_
w_
special
)
special_tokens_mask
=
encoded_sequence_
dict
[
"
special
_tokens_mask"
]
]
self
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_w_special
))
filtered_sequence
=
[
x
for
x
in
filtered_sequence
if
x
is
not
None
]
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
filtered_sequence
=
[
(
x
if
not
special_tokens_mask
[
i
]
else
None
)
for
i
,
x
in
enumerate
(
encoded_sequence_w_special
)
# Testing with already existing special tokens
]
if
tokenizer
.
cls_token_id
==
tokenizer
.
unk_token_id
and
tokenizer
.
cls_token_id
==
tokenizer
.
unk_token_id
:
filtered_sequence
=
[
x
for
x
in
filtered_sequence
if
x
is
not
None
]
tokenizer
.
add_special_tokens
({
"cls_token"
:
"</s>"
,
"sep_token"
:
"<s>"
}
)
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
add_special_tokens
=
True
,
return_
special
_
tokens
_mask
=
True
# Testing with already existing
special
tokens
)
if
tokenizer
.
cls_token_id
==
tokenizer
.
unk_token_id
and
tokenizer
.
cls_token_id
==
tokenizer
.
unk_token_id
:
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
tokenizer
.
add_special_tokens
({
"cls_token"
:
"</s>"
,
"sep_token"
:
"<s>"
})
special_tokens_mask_orig
=
encoded_sequence_dict
[
"special_tokens_mask"
]
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
s
pecial_tokens_mask
=
tokenizer
.
get
_special_tokens_mask
(
s
equence_0
,
add_special_tokens
=
True
,
return
_special_tokens_mask
=
True
encoded_sequence_w_special
,
already_has_special_tokens
=
True
)
)
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
self
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_
w_
special
))
special_tokens_mask
_orig
=
encoded_sequence_
dict
[
"
special
_tokens_mask"
]
self
.
assertEqual
(
special_tokens_mask
_orig
,
special_tokens_mask
)
special_tokens_mask
=
tokenizer
.
get_
special_tokens_mask
(
encoded_sequence_w_special
,
already_has_special_tokens
=
True
def
test_padding_to_max_length
(
self
):
)
tokenizer
=
self
.
get_tokenizer
(
)
self
.
assertEqual
(
len
(
special_tokens_mask
),
len
(
encoded_sequence_w_special
)
)
self
.
assertEqual
(
special_tokens_mask_orig
,
special_tokens_mask
)
sequence
=
"Sequence"
padding_size
=
10
def
test_padding_to_max_length
(
self
):
padding_idx
=
tokenizer
.
pad_token_id
tokenizer
=
self
.
get_tokenizer
()
# RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
sequence
=
"Sequence"
tokenizer
.
padding_si
d
e
=
"right"
padding_si
z
e
=
10
encoded_sequence
=
tokenizer
.
encode
(
sequence
)
padding_idx
=
tokenizer
.
pad_token_id
sequence_length
=
len
(
encoded_sequence
)
padded_sequence
=
tokenizer
.
encode
(
# RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
sequence
,
max_length
=
sequence_length
+
padding_size
,
pad_to_max_length
=
True
tokenizer
.
padding_side
=
"right"
)
encoded_sequence
=
tokenizer
.
encode
(
sequence
)
padded_
sequence_length
=
len
(
pad
ded_sequence
)
sequence_length
=
len
(
enco
ded_sequence
)
assert
sequence_length
+
padding_size
==
pad
ded_sequence
_length
padded_sequence
=
tokenizer
.
encode
(
sequence
,
max_length
=
sequence_length
+
padding_size
,
pad
_to_max
_length
=
True
)
assert
encoded_sequence
+
[
padding_idx
]
*
padding_size
==
padded_sequence
padded_sequence_length
=
len
(
padded_sequence
)
assert
sequence_length
+
padding_size
==
padded_sequence_length
# LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to Tru
e
assert
encoded_sequence
+
[
padding_idx
]
*
padding_size
==
padded_sequenc
e
tokenizer
.
padding_side
=
"left"
encoded_sequence
=
tokenizer
.
encode
(
sequence
)
# LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
sequence_length
=
len
(
encoded_sequence
)
tokenizer
.
padding_side
=
"left"
pad
ded_sequence
=
tokenizer
.
encode
(
enco
ded_sequence
=
tokenizer
.
encode
(
sequence
)
sequence
,
max
_length
=
sequence_length
+
padding_size
,
pad_to_max_length
=
True
sequence_length
=
len
(
encoded_sequence
)
)
padded_sequence
=
tokenizer
.
encode
(
sequence
,
max_length
=
sequence_length
+
padding_size
,
pad_to_max_length
=
True
)
padded_sequence_length
=
len
(
padded_sequence
)
padded_sequence_length
=
len
(
padded_sequence
)
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
[
padding_idx
]
*
padding_size
+
encoded_sequence
==
padded_sequence
assert
[
padding_idx
]
*
padding_size
+
encoded_sequence
==
padded_sequence
# RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
# RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
encoded_sequence
=
tokenizer
.
encode
(
sequence
)
encoded_sequence
=
tokenizer
.
encode
(
sequence
)
sequence_length
=
len
(
encoded_sequence
)
sequence_length
=
len
(
encoded_sequence
)
tokenizer
.
padding_side
=
"right"
tokenizer
.
padding_side
=
"right"
padded_sequence_right
=
tokenizer
.
encode
(
sequence
,
pad_to_max_length
=
True
)
padded_sequence_right
=
tokenizer
.
encode
(
sequence
,
pad_to_max_length
=
True
)
padded_sequence_right_length
=
len
(
padded_sequence_right
)
padded_sequence_right_length
=
len
(
padded_sequence_right
)
tokenizer
.
padding_side
=
"left"
tokenizer
.
padding_side
=
"left"
padded_sequence_left
=
tokenizer
.
encode
(
sequence
,
pad_to_max_length
=
True
)
padded_sequence_left
=
tokenizer
.
encode
(
sequence
,
pad_to_max_length
=
True
)
padded_sequence_left_length
=
len
(
padded_sequence_left
)
padded_sequence_left_length
=
len
(
padded_sequence_left
)
assert
sequence_length
==
padded_sequence_right_length
assert
sequence_length
==
padded_sequence_right_length
assert
encoded_sequence
==
padded_sequence_right
assert
encoded_sequence
==
padded_sequence_right
assert
sequence_length
==
padded_sequence_left_length
assert
sequence_length
==
padded_sequence_left_length
assert
encoded_sequence
==
padded_sequence_left
assert
encoded_sequence
==
padded_sequence_left
def
test_encode_plus_with_padding
(
self
):
def
test_encode_plus_with_padding
(
self
):
tokenizer
=
self
.
get_tokenizer
()
tokenizer
=
self
.
get_tokenizer
()
sequence
=
"Sequence"
sequence
=
"Sequence"
padding_size
=
10
padding_size
=
10
padding_idx
=
tokenizer
.
pad_token_id
padding_idx
=
tokenizer
.
pad_token_id
token_type_padding_idx
=
tokenizer
.
pad_token_type_id
token_type_padding_idx
=
tokenizer
.
pad_token_type_id
encoded_sequence
=
tokenizer
.
encode_plus
(
sequence
,
return_special_tokens_mask
=
True
)
encoded_sequence
=
tokenizer
.
encode_plus
(
sequence
,
return_special_tokens_mask
=
True
)
input_ids
=
encoded_sequence
[
"input_ids"
]
input_ids
=
encoded_sequence
[
"input_ids"
]
token_type_ids
=
encoded_sequence
[
"token_type_ids"
]
token_type_ids
=
encoded_sequence
[
"token_type_ids"
]
attention_mask
=
encoded_sequence
[
"attention_mask"
]
attention_mask
=
encoded_sequence
[
"attention_mask"
]
special_tokens_mask
=
encoded_sequence
[
"special_tokens_mask"
]
special_tokens_mask
=
encoded_sequence
[
"special_tokens_mask"
]
sequence_length
=
len
(
input_ids
)
sequence_length
=
len
(
input_ids
)
# Test right padding
# Test right padding
tokenizer
.
padding_side
=
"right"
tokenizer
.
padding_side
=
"right"
padded_sequence
=
tokenizer
.
encode_plus
(
padded_sequence
=
tokenizer
.
encode_plus
(
sequence
,
sequence
,
max_length
=
sequence_length
+
padding_size
,
max_length
=
sequence_length
+
padding_size
,
pad_to_max_length
=
True
,
pad_to_max_length
=
True
,
return_special_tokens_mask
=
True
,
return_special_tokens_mask
=
True
,
)
)
padded_input_ids
=
padded_sequence
[
"input_ids"
]
padded_input_ids
=
padded_sequence
[
"input_ids"
]
padded_token_type_ids
=
padded_sequence
[
"token_type_ids"
]
padded_token_type_ids
=
padded_sequence
[
"token_type_ids"
]
padded_attention_mask
=
padded_sequence
[
"attention_mask"
]
padded_attention_mask
=
padded_sequence
[
"attention_mask"
]
padded_special_tokens_mask
=
padded_sequence
[
"special_tokens_mask"
]
padded_special_tokens_mask
=
padded_sequence
[
"special_tokens_mask"
]
padded_sequence_length
=
len
(
padded_input_ids
)
padded_sequence_length
=
len
(
padded_input_ids
)
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
input_ids
+
[
padding_idx
]
*
padding_size
==
padded_input_ids
assert
input_ids
+
[
padding_idx
]
*
padding_size
==
padded_input_ids
assert
token_type_ids
+
[
token_type_padding_idx
]
*
padding_size
==
padded_token_type_ids
assert
token_type_ids
+
[
token_type_padding_idx
]
*
padding_size
==
padded_token_type_ids
assert
attention_mask
+
[
0
]
*
padding_size
==
padded_attention_mask
assert
attention_mask
+
[
0
]
*
padding_size
==
padded_attention_mask
assert
special_tokens_mask
+
[
1
]
*
padding_size
==
padded_special_tokens_mask
assert
special_tokens_mask
+
[
1
]
*
padding_size
==
padded_special_tokens_mask
# Test left padding
# Test left padding
tokenizer
.
padding_side
=
"left"
tokenizer
.
padding_side
=
"left"
padded_sequence
=
tokenizer
.
encode_plus
(
padded_sequence
=
tokenizer
.
encode_plus
(
sequence
,
sequence
,
max_length
=
sequence_length
+
padding_size
,
max_length
=
sequence_length
+
padding_size
,
pad_to_max_length
=
True
,
pad_to_max_length
=
True
,
return_special_tokens_mask
=
True
,
return_special_tokens_mask
=
True
,
)
)
padded_input_ids
=
padded_sequence
[
"input_ids"
]
padded_input_ids
=
padded_sequence
[
"input_ids"
]
padded_token_type_ids
=
padded_sequence
[
"token_type_ids"
]
padded_token_type_ids
=
padded_sequence
[
"token_type_ids"
]
padded_attention_mask
=
padded_sequence
[
"attention_mask"
]
padded_attention_mask
=
padded_sequence
[
"attention_mask"
]
padded_special_tokens_mask
=
padded_sequence
[
"special_tokens_mask"
]
padded_special_tokens_mask
=
padded_sequence
[
"special_tokens_mask"
]
padded_sequence_length
=
len
(
padded_input_ids
)
padded_sequence_length
=
len
(
padded_input_ids
)
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
sequence_length
+
padding_size
==
padded_sequence_length
assert
[
padding_idx
]
*
padding_size
+
input_ids
==
padded_input_ids
assert
[
padding_idx
]
*
padding_size
+
input_ids
==
padded_input_ids
assert
[
token_type_padding_idx
]
*
padding_size
+
token_type_ids
==
padded_token_type_ids
assert
[
token_type_padding_idx
]
*
padding_size
+
token_type_ids
==
padded_token_type_ids
assert
[
0
]
*
padding_size
+
attention_mask
==
padded_attention_mask
assert
[
0
]
*
padding_size
+
attention_mask
==
padded_attention_mask
assert
[
1
]
*
padding_size
+
special_tokens_mask
==
padded_special_tokens_mask
assert
[
1
]
*
padding_size
+
special_tokens_mask
==
padded_special_tokens_mask
tests/test_tokenization_ctrl.py
View file @
00204f2b
...
@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
...
@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import
json
import
json
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_ctrl
import
VOCAB_FILES_NAMES
,
CTRLTokenizer
from
transformers.tokenization_ctrl
import
VOCAB_FILES_NAMES
,
CTRLTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
class
CTRLTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
CTRLTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
CTRLTokenizer
tokenizer_class
=
CTRLTokenizer
...
...
tests/test_tokenization_gpt2.py
View file @
00204f2b
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import
json
import
json
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_gpt2
import
VOCAB_FILES_NAMES
,
GPT2Tokenizer
from
transformers.tokenization_gpt2
import
VOCAB_FILES_NAMES
,
GPT2Tokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
class
GPT2TokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
GPT2TokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
GPT2Tokenizer
tokenizer_class
=
GPT2Tokenizer
...
...
tests/test_tokenization_openai.py
View file @
00204f2b
...
@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
...
@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import
json
import
json
import
os
import
os
import
unittest
from
transformers.tokenization_openai
import
VOCAB_FILES_NAMES
,
OpenAIGPTTokenizer
from
transformers.tokenization_openai
import
VOCAB_FILES_NAMES
,
OpenAIGPTTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
class
OpenAIGPTTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
OpenAIGPTTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
OpenAIGPTTokenizer
tokenizer_class
=
OpenAIGPTTokenizer
...
...
tests/test_tokenization_roberta.py
View file @
00204f2b
...
@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera
...
@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import
json
import
json
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers.tokenization_roberta
import
VOCAB_FILES_NAMES
,
RobertaTokenizer
from
transformers.tokenization_roberta
import
VOCAB_FILES_NAMES
,
RobertaTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
slow
from
.utils
import
slow
class
RobertaTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
RobertaTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
RobertaTokenizer
tokenizer_class
=
RobertaTokenizer
def
setUp
(
self
):
def
setUp
(
self
):
...
...
tests/test_tokenization_t5.py
View file @
00204f2b
...
@@ -15,17 +15,18 @@
...
@@ -15,17 +15,18 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
transformers.tokenization_t5
import
T5Tokenizer
from
transformers.tokenization_t5
import
T5Tokenizer
from
transformers.tokenization_xlnet
import
SPIECE_UNDERLINE
from
transformers.tokenization_xlnet
import
SPIECE_UNDERLINE
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/test_sentencepiece.model"
)
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/test_sentencepiece.model"
)
class
T5TokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
T5TokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
T5Tokenizer
tokenizer_class
=
T5Tokenizer
...
...
tests/test_tokenization_transfo_xl.py
View file @
00204f2b
...
@@ -15,11 +15,12 @@
...
@@ -15,11 +15,12 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
io
import
open
from
io
import
open
from
transformers
import
is_torch_available
from
transformers
import
is_torch_available
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
require_torch
from
.utils
import
require_torch
...
@@ -28,7 +29,7 @@ if is_torch_available():
...
@@ -28,7 +29,7 @@ if is_torch_available():
@
require_torch
@
require_torch
class
TransfoXLTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
TransfoXLTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
TransfoXLTokenizer
if
is_torch_available
()
else
None
tokenizer_class
=
TransfoXLTokenizer
if
is_torch_available
()
else
None
...
...
tests/test_tokenization_xlm.py
View file @
00204f2b
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import
json
import
json
import
os
import
os
import
unittest
from
transformers.tokenization_xlm
import
VOCAB_FILES_NAMES
,
XLMTokenizer
from
transformers.tokenization_xlm
import
VOCAB_FILES_NAMES
,
XLMTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
slow
from
.utils
import
slow
class
XLMTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
XLMTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
XLMTokenizer
tokenizer_class
=
XLMTokenizer
...
...
tests/test_tokenization_xlnet.py
View file @
00204f2b
...
@@ -15,17 +15,18 @@
...
@@ -15,17 +15,18 @@
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
os
import
os
import
unittest
from
transformers.tokenization_xlnet
import
SPIECE_UNDERLINE
,
XLNetTokenizer
from
transformers.tokenization_xlnet
import
SPIECE_UNDERLINE
,
XLNetTokenizer
from
.test_tokenization_commo
import
CommonTestCases
from
.test_tokenization_commo
n
import
TokenizerTesterMixin
from
.utils
import
slow
from
.utils
import
slow
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/test_sentencepiece.model"
)
SAMPLE_VOCAB
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
"fixtures/test_sentencepiece.model"
)
class
XLNetTokenizationTest
(
CommonTestCases
.
CommonTokenizerTester
):
class
XLNetTokenizationTest
(
TokenizerTesterMixin
,
unittest
.
TestCase
):
tokenizer_class
=
XLNetTokenizer
tokenizer_class
=
XLNetTokenizer
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment