chenpangpang / transformers / Commits / 25b8b8a6
Unverified commit 25b8b8a6, authored Jan 06, 2022 by AK391, committed by GitHub on Jan 06, 2022

Merge branch 'huggingface:master' into master

Parents: 23801367, b67f345d
Changes: 21

Showing 1 changed file with 57 additions and 52 deletions.
tests/test_tokenization_common.py  +57 −52
tests/test_tokenization_common.py (view file @ 25b8b8a6)
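The changes below apply one mechanical substitution throughout `TokenizerTesterMixin`: bare `assert` statements are replaced with the equivalent `unittest.TestCase` methods (`assertEqual`, `assertLess`, `assertGreater`, `assertGreaterEqual`, `assertTrue`, `assertFalse`). The method form reports both operands when a check fails and, unlike a plain `assert`, is not stripped when Python runs with optimizations enabled (`python -O`). A minimal sketch of the pattern, with hypothetical values standing in for real tokenizer outputs:

import unittest


class PaddingSketch(unittest.TestCase):
    def test_padded_length(self):
        # Hypothetical numbers; the real tests derive these from a tokenizer.
        sequence_length, padding_size = 8, 4
        padded_sequence_length = 12

        # Old style: fails with a bare AssertionError and is skipped
        # entirely under `python -O`.
        assert sequence_length + padding_size == padded_sequence_length

        # New style: on failure unittest prints both values, e.g. "12 != 13".
        self.assertEqual(sequence_length + padding_size, padded_sequence_length)


if __name__ == "__main__":
    unittest.main()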
@@ -893,7 +893,7 @@ class TokenizerTesterMixin:
         sequence = tokenizer.encode(seq_0, add_special_tokens=False)
         total_length = len(sequence)
-        assert total_length > 4, "Issue with the testing sequence, please update it it's too short"
+        self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")

         # Test with max model input length
         model_max_length = tokenizer.model_max_length
@@ -902,9 +902,9 @@ class TokenizerTesterMixin:
         sequence1 = tokenizer(seq_1, add_special_tokens=False)
         total_length1 = len(sequence1["input_ids"])
-        assert (total_length1 > model_max_length), "Issue with the testing sequence, please update it it's too short"
+        self.assertGreater(total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short")

         # Simple
         padding_strategies = (
@@ -989,7 +989,7 @@ class TokenizerTesterMixin:
         ids = None

         seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
-        assert len(seq0_tokens) > 2 + stride
+        self.assertGreater(len(seq0_tokens), 2 + stride)

         seq_1 = "This is another sentence to be encoded."
         seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
@@ -998,7 +998,7 @@ class TokenizerTesterMixin:
         seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
         seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
-        assert len(seq1_tokens) > 2 + stride
+        self.assertGreater(len(seq1_tokens), 2 + stride)

         smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens
@@ -1010,14 +1010,18 @@ class TokenizerTesterMixin:
         model_max_length = tokenizer.model_max_length
         self.assertEqual(model_max_length, 100)
         seq_2 = seq_0 * model_max_length
-        assert len(seq_2) > model_max_length
+        self.assertGreater(len(seq_2), model_max_length)

         sequence1 = tokenizer(seq_1, add_special_tokens=False)
         total_length1 = len(sequence1["input_ids"])
         sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
         total_length2 = len(sequence2["input_ids"])
-        assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
-        assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."
+        self.assertLess(
+            total_length1, model_max_length - 10, "Issue with the testing sequence, please update it."
+        )
+        self.assertGreater(
+            total_length2, model_max_length, "Issue with the testing sequence, please update it."
+        )

         # Simple
         padding_strategies = (
@@ -1279,7 +1283,7 @@ class TokenizerTesterMixin:
         # # Test first masked sequence
         # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
         # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
-        # assert len(encoded_masked) == len(encoded_0)
+        # self.assertEqual(len(encoded_masked), len(encoded_0))
         # mask_loc = encoded_masked.index(mask_ind)
         # encoded_masked[mask_loc] = encoded_0[mask_loc]
@@ -1288,7 +1292,7 @@ class TokenizerTesterMixin:
         # # Test second masked sequence
         # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
         # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
-        # assert len(encoded_masked) == len(encoded_1)
+        # self.assertEqual(len(encoded_masked), len(encoded_1))
         # mask_loc = encoded_masked.index(mask_ind)
         # encoded_masked[mask_loc] = encoded_1[mask_loc]
@@ -1356,8 +1360,8 @@ class TokenizerTesterMixin:
             sequence, max_length=sequence_length + padding_size, padding="max_length"
         )
         padded_sequence_length = len(padded_sequence)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+        self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

         # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
         tokenizer.padding_side = "left"
@@ -1367,8 +1371,8 @@ class TokenizerTesterMixin:
             sequence, max_length=sequence_length + padding_size, padding="max_length"
         )
         padded_sequence_length = len(padded_sequence)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+        self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

         # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
         encoded_sequence = tokenizer.encode(sequence)
@@ -1377,26 +1381,26 @@ class TokenizerTesterMixin:
         tokenizer.padding_side = "right"
         padded_sequence_right = tokenizer.encode(sequence, padding=True)
         padded_sequence_right_length = len(padded_sequence_right)
-        assert sequence_length == padded_sequence_right_length
-        assert encoded_sequence == padded_sequence_right
+        self.assertEqual(sequence_length, padded_sequence_right_length)
+        self.assertEqual(encoded_sequence, padded_sequence_right)

         tokenizer.padding_side = "left"
         padded_sequence_left = tokenizer.encode(sequence, padding="longest")
         padded_sequence_left_length = len(padded_sequence_left)
-        assert sequence_length == padded_sequence_left_length
-        assert encoded_sequence == padded_sequence_left
+        self.assertEqual(sequence_length, padded_sequence_left_length)
+        self.assertEqual(encoded_sequence, padded_sequence_left)

         tokenizer.padding_side = "right"
         padded_sequence_right = tokenizer.encode(sequence)
         padded_sequence_right_length = len(padded_sequence_right)
-        assert sequence_length == padded_sequence_right_length
-        assert encoded_sequence == padded_sequence_right
+        self.assertEqual(sequence_length, padded_sequence_right_length)
+        self.assertEqual(encoded_sequence, padded_sequence_right)

         tokenizer.padding_side = "left"
         padded_sequence_left = tokenizer.encode(sequence, padding=False)
         padded_sequence_left_length = len(padded_sequence_left)
-        assert sequence_length == padded_sequence_left_length
-        assert encoded_sequence == padded_sequence_left
+        self.assertEqual(sequence_length, padded_sequence_left_length)
+        self.assertEqual(encoded_sequence, padded_sequence_left)

     def test_right_and_left_truncation(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
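The four no-op checks above lean on the padding API's semantics: for a single (unbatched) sequence, `padding=True` and `padding="longest"` both pad only up to the longest sequence in the batch, so they add nothing, and `padding=False` (the default) disables padding outright. A hedged sketch of that equivalence, assuming the `bert-base-uncased` checkpoint is downloadable; it is illustrative and not part of the test suite:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tok.encode("Sequence")

# With one sequence there is nothing longer to pad against,
# so "longest"-style padding leaves the ids untouched.
assert tok.encode("Sequence", padding=True) == ids
assert tok.encode("Sequence", padding="longest") == ids
assert tok.encode("Sequence", padding=False) == ids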
@@ -1478,8 +1482,8 @@ class TokenizerTesterMixin:
             sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
         )
         padded_sequence_length = len(padded_sequence)
-        assert sequence_length + padding_size == padded_sequence_length
-        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+        self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+        self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

         # Check that nothing is done when a maximum length is not specified
         encoded_sequence = tokenizer.encode(sequence)
@@ -1488,8 +1492,8 @@ class TokenizerTesterMixin:
         tokenizer.padding_side = "right"
         padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
         padded_sequence_right_length = len(padded_sequence_right)
-        assert sequence_length == padded_sequence_right_length
-        assert encoded_sequence == padded_sequence_right
+        self.assertEqual(sequence_length, padded_sequence_right_length)
+        self.assertEqual(encoded_sequence, padded_sequence_right)

     def test_padding_to_multiple_of(self):
         tokenizers = self.get_tokenizers()
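The two hunks above exercise the legacy `pad_to_max_length=True` flag, which the library has deprecated in favor of `padding="max_length"`; the assertions mirror the earlier `padding="max_length"` hunks, with only the keyword differing, so the old argument is verified to keep behaving like the new one.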
@@ -1575,9 +1579,9 @@ class TokenizerTesterMixin:
         not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
         not_padded_sequence_length = len(not_padded_input_ids)
-        assert sequence_length == not_padded_sequence_length
-        assert input_ids == not_padded_input_ids
-        assert special_tokens_mask == not_padded_special_tokens_mask
+        self.assertEqual(sequence_length, not_padded_sequence_length)
+        self.assertEqual(input_ids, not_padded_input_ids)
+        self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

         not_padded_sequence = tokenizer.encode_plus(
             sequence,
@@ -1589,9 +1593,9 @@ class TokenizerTesterMixin:
         not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
         not_padded_sequence_length = len(not_padded_input_ids)
-        assert sequence_length == not_padded_sequence_length
-        assert input_ids == not_padded_input_ids
-        assert special_tokens_mask == not_padded_special_tokens_mask
+        self.assertEqual(sequence_length, not_padded_sequence_length)
+        self.assertEqual(input_ids, not_padded_input_ids)
+        self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

         # Test right padding
         tokenizer.padding_side = "right"
@@ -1607,9 +1611,9 @@ class TokenizerTesterMixin:
         right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
         right_padded_sequence_length = len(right_padded_input_ids)
-        assert sequence_length + padding_size == right_padded_sequence_length
-        assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
-        assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
+        self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
+        self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
+        self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

         # Test left padding
         tokenizer.padding_side = "left"
@@ -1623,25 +1627,29 @@ class TokenizerTesterMixin:
         left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
         left_padded_sequence_length = len(left_padded_input_ids)
-        assert sequence_length + padding_size == left_padded_sequence_length
-        assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
-        assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+        self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
+        self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
+        self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

         if "token_type_ids" in tokenizer.model_input_names:
             token_type_ids = encoded_sequence["token_type_ids"]
             left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
             right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

-            assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
-            assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+            self.assertEqual(
+                token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids
+            )
+            self.assertEqual(
+                [token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids
+            )

         if "attention_mask" in tokenizer.model_input_names:
             attention_mask = encoded_sequence["attention_mask"]
             right_padded_attention_mask = right_padded_sequence["attention_mask"]
             left_padded_attention_mask = left_padded_sequence["attention_mask"]

-            assert attention_mask + [0] * padding_size == right_padded_attention_mask
-            assert [0] * padding_size + attention_mask == left_padded_attention_mask
+            self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
+            self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask)

     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
@@ -1652,9 +1660,9 @@ class TokenizerTesterMixin:
         for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers):
             with self.subTest(f"{tokenizer.__class__.__name__}"):
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert new_tokenizer.init_kwargs["random_argument"] is False
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertFalse(new_tokenizer.init_kwargs["random_argument"])

     def test_get_vocab(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
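One nuance in this hunk: `assert x is True` checked identity with the `True` singleton, while `self.assertTrue(x)` only checks truthiness, so the new form is marginally weaker. For the boolean `random_argument` flag exercised here the two behave identically; a toy illustration of the difference:

import unittest


class TruthinessSketch(unittest.TestCase):
    def test_truthiness_vs_identity(self):
        self.assertTrue(True)  # passes, same as the old `is True` check
        self.assertTrue(1)     # also passes: 1 is truthy but is not True


if __name__ == "__main__":
    unittest.main()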
@@ -2119,11 +2127,8 @@ class TokenizerTesterMixin:
         # Make sure the model contains at least the full vocabulary size in its embedding matrix
         is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
-        assert (
-            (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
-            if is_using_common_embeddings
-            else True
-        )
+        if is_using_common_embeddings:
+            self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer))

         # Build sequence
         first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
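This hunk restructures rather than just renames: the old five-line `assert (cond if is_using_common_embeddings else True)` collapses to `assert True` whenever the guard is false, burying the real check inside an expression that can never fail in that branch. The rewrite hoists the guard into an explicit `if`, and `assertGreaterEqual` reports both the embedding-matrix size and the vocabulary size on failure. A compact sketch of the degenerate case, with made-up numbers:

# With the guard False, the old expression "passes" while checking nothing.
is_using_common_embeddings = False
assert (10 >= 20) if is_using_common_embeddings else True  # passes vacuously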
@@ -2170,7 +2175,7 @@ class TokenizerTesterMixin:
         model = model_class(config)

         # Make sure the model contains at least the full vocabulary size in its embedding matrix
-        assert model.config.vocab_size >= len(tokenizer)
+        self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

         # Build sequence
         first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]