Unverified Commit d2183a46, authored by Nicolas Patry, committed by GitHub

Remove old asserts. (#15012)

parent 83c552d3
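Bare `assert` statements are stripped when Python runs with `-O`, and they report little context when they fail; the `unittest` helpers (`self.assertEqual`, `self.assertGreater`, ...) are always executed and print both operands. This diff applies that replacement throughout `TokenizerTesterMixin`. As a minimal standalone sketch of the pattern (illustrative example only, not code from this repository):

```python
import unittest


class ExampleTest(unittest.TestCase):
    # Illustrative sketch of the pattern applied in this diff (not repository code):
    # bare `assert` statements are replaced with unittest's self.assert* helpers,
    # which survive `python -O` and report both operands on failure.
    def test_lengths(self):
        sequence = [1, 2, 3, 4, 5]
        total_length = len(sequence)

        # Before: assert total_length > 4, "sequence too short"
        # After:
        self.assertGreater(total_length, 4, "sequence too short")

        padded = sequence + [0] * 3
        # Before: assert sequence + [0] * 3 == padded
        # After: assertEqual prints a readable diff of the two lists if they differ.
        self.assertEqual(sequence + [0] * 3, padded)


if __name__ == "__main__":
    unittest.main()
```

On failure, `assertEqual` shows which elements differ, which a bare `assert` cannot do.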
@@ -893,7 +893,7 @@ class TokenizerTesterMixin:
                 sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                 total_length = len(sequence)

-                assert total_length > 4, "Issue with the testing sequence, please update it it's too short"
+                self.assertGreater(total_length, 4, "Issue with the testing sequence, please update it it's too short")

                 # Test with max model input length
                 model_max_length = tokenizer.model_max_length
@@ -902,9 +902,9 @@ class TokenizerTesterMixin:
                 sequence1 = tokenizer(seq_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])

-                assert (
-                    total_length1 > model_max_length
-                ), "Issue with the testing sequence, please update it it's too short"
+                self.assertGreater(
+                    total_length1, model_max_length, "Issue with the testing sequence, please update it it's too short"
+                )

                 # Simple
                 padding_strategies = (
@@ -989,7 +989,7 @@ class TokenizerTesterMixin:
                 ids = None

                 seq0_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
-                assert len(seq0_tokens) > 2 + stride
+                self.assertGreater(len(seq0_tokens), 2 + stride)

                 seq_1 = "This is another sentence to be encoded."
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
@@ -998,7 +998,7 @@ class TokenizerTesterMixin:
                 seq_1 = tokenizer.decode(seq1_tokens, clean_up_tokenization_spaces=False)
                 seq1_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

-                assert len(seq1_tokens) > 2 + stride
+                self.assertGreater(len(seq1_tokens), 2 + stride)

                 smallest = seq1_tokens if len(seq0_tokens) > len(seq1_tokens) else seq0_tokens
@@ -1010,14 +1010,18 @@ class TokenizerTesterMixin:
                 model_max_length = tokenizer.model_max_length
                 self.assertEqual(model_max_length, 100)
                 seq_2 = seq_0 * model_max_length
-                assert len(seq_2) > model_max_length
+                self.assertGreater(len(seq_2), model_max_length)

                 sequence1 = tokenizer(seq_1, add_special_tokens=False)
                 total_length1 = len(sequence1["input_ids"])
                 sequence2 = tokenizer(seq_2, seq_1, add_special_tokens=False)
                 total_length2 = len(sequence2["input_ids"])
-                assert total_length1 < model_max_length - 10, "Issue with the testing sequence, please update it."
-                assert total_length2 > model_max_length, "Issue with the testing sequence, please update it."
+                self.assertLess(
+                    total_length1, model_max_length - 10, "Issue with the testing sequence, please update it."
+                )
+                self.assertGreater(
+                    total_length2, model_max_length, "Issue with the testing sequence, please update it."
+                )

                 # Simple
                 padding_strategies = (
@@ -1279,7 +1283,7 @@ class TokenizerTesterMixin:
        # # Test first masked sequence
        # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
        # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
-       # assert len(encoded_masked) == len(encoded_0)
+       # self.assertEqual(len(encoded_masked), len(encoded_0))
        # mask_loc = encoded_masked.index(mask_ind)
        # encoded_masked[mask_loc] = encoded_0[mask_loc]
@@ -1288,7 +1292,7 @@ class TokenizerTesterMixin:
        # # Test second masked sequence
        # encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
        # encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
-       # assert len(encoded_masked) == len(encoded_1)
+       # self.assertEqual(len(encoded_masked), len(encoded_1))
        # mask_loc = encoded_masked.index(mask_ind)
        # encoded_masked[mask_loc] = encoded_1[mask_loc]
@@ -1356,8 +1360,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, padding="max_length"
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                 # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                 tokenizer.padding_side = "left"
@@ -1367,8 +1371,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, padding="max_length"
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

                 # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                 encoded_sequence = tokenizer.encode(sequence)
@@ -1377,26 +1381,26 @@ class TokenizerTesterMixin:
                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence, padding=True)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

                 tokenizer.padding_side = "left"
                 padded_sequence_left = tokenizer.encode(sequence, padding="longest")
                 padded_sequence_left_length = len(padded_sequence_left)
-                assert sequence_length == padded_sequence_left_length
-                assert encoded_sequence == padded_sequence_left
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertEqual(encoded_sequence, padded_sequence_left)

                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

                 tokenizer.padding_side = "left"
                 padded_sequence_left = tokenizer.encode(sequence, padding=False)
                 padded_sequence_left_length = len(padded_sequence_left)
-                assert sequence_length == padded_sequence_left_length
-                assert encoded_sequence == padded_sequence_left
+                self.assertEqual(sequence_length, padded_sequence_left_length)
+                self.assertEqual(encoded_sequence, padded_sequence_left)

     def test_right_and_left_truncation(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -1478,8 +1482,8 @@ class TokenizerTesterMixin:
                     sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
                 )
                 padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
+                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                 # Check that nothing is done when a maximum length is not specified
                 encoded_sequence = tokenizer.encode(sequence)
@@ -1488,8 +1492,8 @@ class TokenizerTesterMixin:
                 tokenizer.padding_side = "right"
                 padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
                 padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
+                self.assertEqual(sequence_length, padded_sequence_right_length)
+                self.assertEqual(encoded_sequence, padded_sequence_right)

     def test_padding_to_multiple_of(self):
         tokenizers = self.get_tokenizers()
@@ -1575,9 +1579,9 @@ class TokenizerTesterMixin:
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                 not_padded_sequence_length = len(not_padded_input_ids)

-                assert sequence_length == not_padded_sequence_length
-                assert input_ids == not_padded_input_ids
-                assert special_tokens_mask == not_padded_special_tokens_mask
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertEqual(input_ids, not_padded_input_ids)
+                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                 not_padded_sequence = tokenizer.encode_plus(
                     sequence,
@@ -1589,9 +1593,9 @@ class TokenizerTesterMixin:
                 not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                 not_padded_sequence_length = len(not_padded_input_ids)

-                assert sequence_length == not_padded_sequence_length
-                assert input_ids == not_padded_input_ids
-                assert special_tokens_mask == not_padded_special_tokens_mask
+                self.assertEqual(sequence_length, not_padded_sequence_length)
+                self.assertEqual(input_ids, not_padded_input_ids)
+                self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)

                 # Test right padding
                 tokenizer.padding_side = "right"
@@ -1607,9 +1611,9 @@ class TokenizerTesterMixin:
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                 right_padded_sequence_length = len(right_padded_input_ids)

-                assert sequence_length + padding_size == right_padded_sequence_length
-                assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
-                assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
+                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
+                self.assertEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
+                self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

                 # Test left padding
                 tokenizer.padding_side = "left"
@@ -1623,25 +1627,29 @@ class TokenizerTesterMixin:
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)

-                assert sequence_length + padding_size == left_padded_sequence_length
-                assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
-                assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
+                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
+                self.assertEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
+                self.assertEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

                 if "token_type_ids" in tokenizer.model_input_names:
                     token_type_ids = encoded_sequence["token_type_ids"]
                     left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                     right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

-                    assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
-                    assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
+                    self.assertEqual(
+                        token_type_ids + [token_type_padding_idx] * padding_size, right_padded_token_type_ids
+                    )
+                    self.assertEqual(
+                        [token_type_padding_idx] * padding_size + token_type_ids, left_padded_token_type_ids
+                    )

                 if "attention_mask" in tokenizer.model_input_names:
                     attention_mask = encoded_sequence["attention_mask"]
                     right_padded_attention_mask = right_padded_sequence["attention_mask"]
                     left_padded_attention_mask = left_padded_sequence["attention_mask"]

-                    assert attention_mask + [0] * padding_size == right_padded_attention_mask
-                    assert [0] * padding_size + attention_mask == left_padded_attention_mask
+                    self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
+                    self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask)

     def test_separate_tokenizers(self):
         # This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
@@ -1652,9 +1660,9 @@ class TokenizerTesterMixin:
         for tokenizer, new_tokenizer in zip(tokenizers, new_tokenizers):
             with self.subTest(f"{tokenizer.__class__.__name__}"):
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert tokenizer.init_kwargs["random_argument"] is True
-                assert new_tokenizer.init_kwargs["random_argument"] is False
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertTrue(tokenizer.init_kwargs["random_argument"])
+                self.assertFalse(new_tokenizer.init_kwargs["random_argument"])

     def test_get_vocab(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -2119,11 +2127,8 @@ class TokenizerTesterMixin:
                 # Make sure the model contains at least the full vocabulary size in its embedding matrix
                 is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
-                assert (
-                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
-                    if is_using_common_embeddings
-                    else True
-                )
+                if is_using_common_embeddings:
+                    self.assertGreaterEqual(model.get_input_embeddings().weight.shape[0], len(tokenizer))

                 # Build sequence
                 first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
@@ -2170,7 +2175,7 @@ class TokenizerTesterMixin:
                 model = model_class(config)

                 # Make sure the model contains at least the full vocabulary size in its embedding matrix
-                assert model.config.vocab_size >= len(tokenizer)
+                self.assertGreaterEqual(model.config.vocab_size, len(tokenizer))

                 # Build sequence
                 first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
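The last large hunk also reworks a conditional assert: `assert (check) if is_using_common_embeddings else True` evaluated to `True` whenever the flag was false, which made the intent hard to read; the new form only runs the check when it applies. A standalone sketch with made-up values (nothing here comes from the repository):

```python
import unittest


class ConditionalCheckExample(unittest.TestCase):
    # Standalone sketch (not repository code) of the rework in the last hunk above:
    # the old `assert (check) if flag else True` buries the condition inside a
    # conditional expression; the new form states it directly and only asserts
    # when the check actually applies.
    def test_embedding_size(self):
        vocab_size = 30522          # hypothetical tokenizer vocabulary size
        embedding_rows = 30522      # hypothetical embedding matrix row count
        is_using_common_embeddings = True

        # Old pattern (shown as a comment for contrast):
        # assert (embedding_rows >= vocab_size) if is_using_common_embeddings else True

        # New pattern: skip the check entirely when it does not apply.
        if is_using_common_embeddings:
            self.assertGreaterEqual(embedding_rows, vocab_size)


if __name__ == "__main__":
    unittest.main()
```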