chenpangpang / transformers · Commits

Commit d2183a46 (unverified)
Authored Jan 06, 2022 by Nicolas Patry; committed via GitHub on Jan 06, 2022

Remove old asserts. (#15012)

Parent: 83c552d3
Showing 1 changed file with 57 additions and 52 deletions.

tests/test_tokenization_common.py (+57 -52)
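The whole diff applies one mechanical pattern: bare `assert` statements in the test mixin are replaced with the equivalent `unittest.TestCase` assertion methods (`assertEqual`, `assertGreater`, `assertLess`, `assertTrue`, ...). Two reasons make this worthwhile: bare asserts are stripped entirely under `python -O`, and on failure they report only the static message, while the unittest methods always run and include both operands in the failure report. A minimal sketch of the pattern, using illustrative values that are not taken from the diff:

import unittest


class ExampleTest(unittest.TestCase):
    def test_total_length(self):
        total_length = len([1, 2, 3, 4, 5])  # illustrative value, not from the diff

        # Old style: a bare assert is stripped under `python -O`, and on
        # failure it reports only the message, not the actual value.
        assert total_length > 4, "sequence too short"

        # New style: always runs, and a failure report includes both
        # operands (e.g. "3 not greater than 4"), which is easier to debug.
        self.assertGreater(total_length, 4, "sequence too short")


if __name__ == "__main__":
    unittest.main()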
@@ -893,7 +893,7 @@ class TokenizerTesterMixin:
                 sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                 total_length = len(sequence)

-                assert total_length > 4, "Issue with the testing sequence, please update it it's too short"
+                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")

                 # Test with max model input length
                 model_max_length = tokenizer.model_max_length
@@ -902,9 +902,9 @@ class TokenizerTesterMixin:
                 sequence1 = tokenizer(seq_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
-                assert (
-                    total_length1 > model_max_length
-                ), "Issue with the testing sequence, please update it it's too short"
+                self.assertGreater(
+                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                )

                 # Simple
                 padding_strategies = (
@@ -989,7 +989,7 @@ class TokenizerTesterMixin:
                 ids = None

                 seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
-                assert len(seq0_tokens) > 2 + stride
+                self.assertGreater(len(seq0_tokens), 2 + stride)

                 seq_1 = "This is another sentence to be encoded."
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
@@ -998,7 +998,7 @@ class TokenizerTesterMixin:
                 seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
-                assert len(seq1_tokens) > 2 + stride
+                self.assertGreater(len(seq1_tokens), 2 + stride)

                 smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens
@@ -1010,14 +1010,18 @@ class TokenizerTesterMixin:
                 model_max_length = tokenizer.model_max_length
                 self.assertEqual(model_max_length, 100)
                 seq_2 = seq_0 * model_max_length
-                assert len(seq_2) > model_max_length
+                self.assertGreater(len(seq_2), model_max_length)

                 sequence1 = tokenizer(seq_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
                 sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
                 total_length2 = len(sequence2["input_ids"])
-                assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
-                assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."
+                self.assertLess(
+                    total_length1, model_max_length - 10, "Issue with the testing sequence, please update it."
+                )
+                self.assertGreater(
+                    total_length2, model_max_length, "Issue with the testing sequence, please update it."
+                )

                 # Simple
                 padding_strategies = (
@@ -1279,7 +1283,7 @@ class TokenizerTesterMixin:
                 # # Test first masked sequence
                 # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
                 # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
-                # assert len(encoded_masked) == len(encoded_0)
+                # self.assertEqual(len(encoded_masked), len(encoded_0))
                 # mask_loc = encoded_masked.index(mask_ind)
                 # encoded_masked[mask_loc] = encoded_0[mask_loc]
@@ -1288,7 +1292,7 @@ class TokenizerTesterMixin:
                 # # Test second masked sequence
                 # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
                 # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
-                # assert len(encoded_masked) == len(encoded_1)
+                # self.assertEqual(len(encoded_masked), len(encoded_1))
                 # mask_loc = encoded_masked.index(mask_ind)
                 # encoded_masked[mask_loc] = encoded_1[mask_loc]
@@ -1356,8 +1360,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, padding="max_length"
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                 # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                 tokenizer.padding_side = "left"
@@ -1367,8 +1371,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, padding="max_length"
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

                 # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                 encoded_sequence = tokenizer.encode(sequence)
@@ -1377,26 +1381,26 @@ class TokenizerTesterMixin:
                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence, padding=True)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

                 tokenizer.padding_side = "left"
                 padded_sequence_left = tokenizer.encode(sequence, padding="longest")
                 padded_sequence_left_length = len(padded_sequence_left)
-                assert sequence_length == padded_sequence_left_length
-                assert encoded_sequence == padded_sequence_left
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertEqual(encoded_sequence, padded_sequence_left)

                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

                 tokenizer.padding_side = "left"
                 padded_sequence_left = tokenizer.encode(sequence, padding=False)
                 padded_sequence_left_length = len(padded_sequence_left)
-                assert sequence_length == padded_sequence_left_length
-                assert encoded_sequence == padded_sequence_left
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertEqual(encoded_sequence, padded_sequence_left)

     def test_right_and_left_truncation(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -1478,8 +1482,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                 # Check that nothing is done when a maximum length is not specified
                 encoded_sequence = tokenizer.encode(sequence)
@@ -1488,8 +1492,8 @@ class TokenizerTesterMixin:
                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

     def test_padding_to_multiple_of(self):
         tokenizers = self.get_tokenizers()
@@ -1575,9 +1579,9 @@ class TokenizerTesterMixin:
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                 not_padded_sequence_length = len(not_padded_input_ids)

-                assert sequence_length == not_padded_sequence_length
-                assert input_ids == not_padded_input_ids
-                assert special_tokens_mask == not_padded_special_tokens_mask
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertEqual(input_ids, not_padded_input_ids)
+                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                 not_padded_sequence = tokenizer.encode_plus(
                     sequence,
@@ -1589,9 +1593,9 @@ class TokenizerTesterMixin:
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                 not_padded_sequence_length = len(not_padded_input_ids)

-                assert sequence_length == not_padded_sequence_length
-                assert input_ids == not_padded_input_ids
-                assert special_tokens_mask == not_padded_special_tokens_mask
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertEqual(input_ids, not_padded_input_ids)
+                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                 # Test right padding
                 tokenizer.padding_side = "right"
@@ -1607,9 +1611,9 @@ class TokenizerTesterMixin:
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                 right_padded_sequence_length = len(right_padded_input_ids)

-                assert sequence_length + padding_size == right_padded_sequence_length
-                assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
-                assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
+                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
+                self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
+                self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

                 # Test left padding
                 tokenizer.padding_side = "left"
@@ -1623,25 +1627,29 @@ class TokenizerTesterMixin:
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)

-                assert sequence_length + padding_size == left_padded_sequence_length
-                assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
-                assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
+                self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
+                self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

                 if "token_type_ids" in tokenizer.model_input_names:
                     token_type_ids = encoded_sequence["token_type_ids"]
                     left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                     right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

-                    assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
-                    assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+                    self.assertEqual(
+                        token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids
+                    )
+                    self.assertEqual(
+                        [token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids
+                    )

                 if "attention_mask" in tokenizer.model_input_names:
                     attention_mask = encoded_sequence["attention_mask"]
                     right_padded_attention_mask = right_padded_sequence["attention_mask"]
                     left_padded_attention_mask = left_padded_sequence["attention_mask"]

-                    assert attention_mask + [0] * padding_size == right_padded_attention_mask
-                    assert [0] * padding_size + attention_mask == left_padded_attention_mask
+                    self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
+                    self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask)

     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
@@ -1652,9 +1660,9 @@ class TokenizerTesterMixin:
         for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers):
             with self.subTest(f"{tokenizer.__class__.__name__}"):
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert new_tokenizer.init_kwargs["random_argument"] is False
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertFalse(new_tokenizer.init_kwargs["random_argument"])

     def test_get_vocab(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
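One subtlety in the hunk above: `assert x is True` is an identity check against the literal True object, while `self.assertTrue(x)` only checks truthiness, so the replacement is a slightly weaker assertion. For these init_kwargs, which hold literal booleans, the two agree. A small sketch with illustrative values, not taken from the diff:

import unittest


class TruthinessTest(unittest.TestCase):
    def test_identity_vs_truthiness(self):
        init_kwargs = {"random_argument": True}  # illustrative kwargs

        # Identity: passes only when the value is the literal True object.
        assert init_kwargs["random_argument"] is True

        # Truthiness: would also accept 1, "yes", [0], and other truthy
        # values, but reports the failing value when it fails.
        self.assertTrue(init_kwargs["random_argument"])


if __name__ == "__main__":
    unittest.main()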
@@ -2119,11 +2127,8 @@ class TokenizerTesterMixin:
                 # Make sure the model contains at least the full vocabulary size in its embedding matrix
                 is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
-                assert (
-                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
-                    if is_using_common_embeddings
-                    else True
-                )
+                if is_using_common_embeddings:
+                    self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer))

                 # Build sequence
                 first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
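The hunk above also untangles a conditional check: `assert (cond if flag else True)` is vacuously true when the flag is off and prints nothing useful when it fails, whereas an explicit `if flag: self.assertGreaterEqual(...)` reads directly and reports both numbers on failure. A sketch of the same transformation with illustrative stand-in values:

import unittest


class EmbeddingSizeTest(unittest.TestCase):
    def test_vocab_fits_in_embeddings(self):
        # Illustrative stand-ins for the real model/tokenizer sizes.
        embedding_rows, vocab_size = 30522, 30000
        is_using_common_embeddings = True

        # Before: one expression, vacuously true when the flag is off.
        assert (embedding_rows >= vocab_size) if is_using_common_embeddings else True

        # After: a plain guard plus a rich assertion that reports both
        # numbers on failure.
        if is_using_common_embeddings:
            self.assertGreaterEqual(embedding_rows, vocab_size)


if __name__ == "__main__":
    unittest.main()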
@@ -2170,7 +2175,7 @@ class TokenizerTesterMixin:
                 model = model_class(config)

                 # Make sure the model contains at least the full vocabulary size in its embedding matrix
-                assert model.config.vocab_size >= len(tokenizer)
+                self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

                 # Build sequence
                 first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]