Unverified Commit 21ed3a6b authored by Funtowicz Morgan, committed by GitHub

Reintroduce clean_text on BertTokenizer call which was removed by mistake in #4723 (#5749)



* Reintroduce clean_text call which was removed by mistake in #4723
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added unittest for clean_text parameter on Bert tokenizer.
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Better unittest name.
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Adapt unittest to use untrained tokenizer.
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Code quality + update test
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
parent 5668fdb0
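
For context, a minimal reproduction of the regression this commit fixes (taken from tokenizers issue #340), assuming the bert-base-uncased checkpoint can be downloaded. With the clean_text call restored, the soft hyphen (U+00AD) is stripped before wordpiece tokenization instead of surfacing as an [UNK] token:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # U+00AD (soft hyphen) falls in Unicode category Cf, which the BERT
    # tokenizer treats as a control character. With _clean_text applied,
    # it is removed before wordpiece tokenization.
    print(tokenizer.tokenize("\xad"))  # expected: []

Before the fix, the slow tokenizer skipped the cleanup step, so the soft hyphen fell through to wordpiece lookup and came back as ["[UNK]"], diverging from the Rust tokenizer.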
@@ -398,6 +398,7 @@ class BasicTokenizer(object):
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
...
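
The reintroduced call delegates to BasicTokenizer._clean_text, which drops NUL, U+FFFD, and control characters and normalizes whitespace. A self-contained sketch of that logic follows; the helper names mirror those in tokenization_bert.py, but this is an approximation for illustration, not the verbatim library code:

    import unicodedata

    def _is_control(char):
        # Tab, newline and carriage return are treated as whitespace, not control.
        if char in ("\t", "\n", "\r"):
            return False
        # Everything in the Unicode "C*" categories (Cc, Cf, ...) counts as
        # control, which is why the soft hyphen (U+00AD, category Cf) is removed.
        return unicodedata.category(char).startswith("C")

    def _is_whitespace(char):
        if char in (" ", "\t", "\n", "\r"):
            return True
        return unicodedata.category(char) == "Zs"

    def _clean_text(text):
        # Drop NUL, U+FFFD and control characters; map any whitespace to " ".
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            output.append(" " if _is_whitespace(char) else char)
        return "".join(output)

    print(repr(_clean_text("Test\xadtest")))  # 'Testtest' -- soft hyphen stripped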
@@ -222,6 +222,17 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertFalse(_is_punctuation("A"))
         self.assertFalse(_is_punctuation(" "))

+    def test_clean_text(self):
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
+        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
+
+        self.assertListEqual(
+            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
+        )
+
     @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
...