"...git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "612e801470308bbe35a53a0f50b9e8b74d2ec3f0"
Unverified commit 66ea7391, authored by Li-Huai (Allan) Lin, committed by GitHub

Improve tokenizer tests (#13594)

* Use new method to acquire tokenizers

* Resolve TODOs.

* Style

* Fix

* Enable do_lower_case in test_tokenize_special_tokens

* Apply suggestion from code review

* Fix mask token handling

* Revert "Fix mask token handling"

This reverts commit daaa3f5291b1f71e5bc3604ca281c000000c4648.

* Fix FNet mask token tokenization

* Complete everything

* Apply suggestions from code review
parent 6645eb61
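For context on the first bullet ("Use new method to acquire tokenizers"): the tests switch from hand-built `[self.get_tokenizer(...)] if self.test_slow_tokenizer else []` lists to the mixin's `get_tokenizers(...)` helper, which yields both the slow (Python) and fast (Rust) tokenizer where available and forwards kwargs such as `do_lower_case=True` to each. A minimal standalone illustration of what that means in practice (my own example using `bert-base-uncased`; the real helper builds tokenizers from the test class's fixtures, not a Hub checkpoint):

```python
# Hedged stand-in for TokenizerTesterMixin.get_tokenizers (illustration only, not repo code).
from transformers import BertTokenizer, BertTokenizerFast


def get_tokenizers(do_lower_case=True):
    # Return both backends so every assertion runs against slow and fast tokenizers.
    return [
        BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=do_lower_case),
        BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=do_lower_case),
    ]


for tokenizer in get_tokenizers(do_lower_case=True):
    print(type(tokenizer).__name__, tokenizer.tokenize("Hello WORLD"))
```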
@@ -314,7 +314,7 @@ class TokenizerTesterMixin:
     # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers.
     def test_tokenize_special_tokens(self):
         """Test `tokenize` with special tokens."""
-        tokenizers = self.get_tokenizers(fast=True)
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
@@ -620,8 +620,7 @@ class TokenizerTesterMixin:
                 self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

     def test_added_tokens_do_lower_case(self):
-        # TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens.
-        tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else []
+        tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
@@ -632,30 +631,34 @@ class TokenizerTesterMixin:
                 text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                 text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
-                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
+                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks
                 new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
-                added = tokenizer.add_tokens(new_toks)
-                self.assertEqual(added, 2)
-                toks = tokenizer.tokenize(text)
-                toks2 = tokenizer.tokenize(text2)
-                self.assertEqual(len(toks), len(toks2))
-                self.assertListEqual(toks, toks2)
-                if not isinstance(tokenizer, PreTrainedTokenizerFast):
-                    # Python tokenizers can have added tokens with spaces inside them
-                    # cf https://github.com/huggingface/tokenizers/issues/302
-                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
+                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
+                toks_after_adding = tokenizer.tokenize(text)
+                toks_after_adding2 = tokenizer.tokenize(text2)
+
+                # Rust tokenizers don't lowercase added tokens at the time of calling `tokenizer.add_tokens`,
+                # while Python tokenizers do, so new_toks 0 and 2 would be treated as the same, as would new_toks 1 and 3.
+                self.assertIn(added, [2, 4])
+
+                self.assertListEqual(toks_after_adding, toks_after_adding2)
+                self.assertTrue(
+                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
+                )

                 # Check that none of the special tokens are lowercased
                 sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
-                tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
+
+                # Convert the tokenized list to str as some special tokens are tokenized like normal tokens
+                # which have a prefix space, e.g. the mask token of Albert, and cannot match the original
+                # special tokens exactly.
+                tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))
+
                 for special_token in tokenizer.all_special_tokens:
                     self.assertTrue(special_token in tokenized_sequence)

-        tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else []
+        tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
@@ -666,22 +669,22 @@ class TokenizerTesterMixin:
                 text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                 text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

-                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
-                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
-                added = tokenizer.add_tokens(new_toks)
+                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks
+
+                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
+                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
                 self.assertIn(added, [2, 4])

-                toks = tokenizer.tokenize(text)
-                toks2 = tokenizer.tokenize(text2)
+                toks_after_adding = tokenizer.tokenize(text)
+                toks_after_adding2 = tokenizer.tokenize(text2)

-                self.assertEqual(len(toks), len(toks2))  # Length should still be the same
-                self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
-                if not isinstance(tokenizer, PreTrainedTokenizerFast):
-                    # Python tokenizers can have added tokens with spaces inside them
-                    # cf https://github.com/huggingface/tokenizers/issues/302
-                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
+                self.assertEqual(len(toks_after_adding), len(toks_after_adding2))  # Length should still be the same
+                self.assertNotEqual(
+                    toks_after_adding[1], toks_after_adding2[1]
+                )  # But at least the first non-special tokens should differ
+                self.assertTrue(
+                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
+                )

     def test_add_tokens_tokenizer(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -780,12 +783,15 @@ class TokenizerTesterMixin:
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
-                # new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
-                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
+                new_toks = [
+                    AddedToken("[ABC]", normalized=False),
+                    AddedToken("[DEF]", normalized=False),
+                    AddedToken("GHI IHG", normalized=False),
+                ]
                 tokenizer.add_tokens(new_toks)
-                input = "[ABC][DEF][ABC][DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                input = "[ABC][DEF][ABC]GHI IHG[DEF]"
                 if self.space_between_special_tokens:
-                    output = "[ABC] [DEF] [ABC] [DEF]"
+                    output = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
                 else:
                     output = input
                 encoded = tokenizer.encode(input, add_special_tokens=False)
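This last hunk drops the old TODO and exercises an added token that contains a space ("GHI IHG") for all tokenizers, expecting it to survive an encode/decode round trip as a single unit. A hedged, standalone sketch of the behaviour being tested (my own example, assuming a `bert-base-uncased` checkpoint; not code from the repository):

```python
# Illustration only: added tokens with internal spaces round-trip as single tokens.
from transformers import AddedToken, BertTokenizerFast

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
tok.add_tokens(
    [
        AddedToken("[ABC]", normalized=False),
        AddedToken("[DEF]", normalized=False),
        AddedToken("GHI IHG", normalized=False),
    ]
)

text = "[ABC][DEF][ABC]GHI IHG[DEF]"
ids = tok.encode(text, add_special_tokens=False)
print(tok.convert_ids_to_tokens(ids))  # each added token maps to a single id
print(tok.decode(ids))                 # e.g. "[ABC] [DEF] [ABC] GHI IHG [DEF]"
```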