Unverified Commit 5739726f authored by Connor Henderson, committed by GitHub

fix: Text splitting in the BasicTokenizer (#22280)

* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer

* account for apostrophe at start of new word

* remove _run_split_on_punc, use re.findall instead

* remove debugging, make style and quality

* use pattern and punc splitting, repo-consistency will fail

* remove commented out debugging

* adds bool args to BasicTokenizer, remove pattern

* do_split_on_punc default True

* clean stray comments and line breaks

* rebase, repo-consistency

* update to just do punctuation split

* add unicode normalizing back

* remove redundant line
parent 2489e380
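
As a quick sketch of the behavior this commit settles on (the first expected output mirrors the new test in the diff below; the do_split_on_punc=False output is an assumption, not taken from the diff):

from transformers.models.bert.tokenization_bert import BasicTokenizer

text = "a\n'll !!to?'d of, can't."

# Default (do_split_on_punc=True): punctuation, including apostrophes, becomes standalone tokens.
tokenizer = BasicTokenizer()
print(tokenizer.tokenize(text))
# -> ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]

# With the new flag turned off, splitting happens on whitespace only,
# so punctuation stays attached to its word (assumed behavior).
no_punc = BasicTokenizer(do_split_on_punc=False)
print(no_punc.tokenize(text))
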
@@ -182,6 +182,12 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
         )
 
+    def test_basic_tokenizer_splits_on_punctuation(self):
+        tokenizer = BasicTokenizer()
+        text = "a\n'll !!to?'d of, can't."
+        expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
+        self.assertListEqual(tokenizer.tokenize(text), expected)
+
     def test_wordpiece_tokenizer(self):
         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
...
@@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
 
-                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
+                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
 
                 text_tokenized_s = tokenizer_s.tokenize(text)
                 text_tokenized_r = tokenizer_r.tokenize(text)

@@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             # "\u0085", # (next line)
         ]
 
-        # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
+        # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
         # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
         # space (and thus into an empty list).
...
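
The changed CLIP test above compares the slow (Python, ftfy-based) and fast (Rust-backed) tokenizers on the same string. A minimal standalone version of that comparison, assuming the public openai/clip-vit-base-patch32 checkpoint (the test itself iterates over its own pretrained names):

from transformers import CLIPTokenizer, CLIPTokenizerFast

checkpoint = "openai/clip-vit-base-patch32"  # assumed checkpoint, for illustration only
tokenizer_s = CLIPTokenizer.from_pretrained(checkpoint)      # slow tokenizer
tokenizer_r = CLIPTokenizerFast.from_pretrained(checkpoint)  # fast tokenizer

text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."

# Apart from "\u0085" (next line), discussed in the comment above, the two are expected to agree.
print(tokenizer_s.tokenize(text))
print(tokenizer_r.tokenize(text))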