"vscode:/vscode.git/clone" did not exist on "fbd02d46939f8c1779d06b0881c41a4355d725ff"
Commit 65a89a89 authored by Mark Neumann's avatar Mark Neumann Committed by Julien Chaumond
Browse files

Fix BasicTokenizer to respect `never_split` parameters (#2557)

* add failing test

* fix call to _run_split_on_punc

* format with black
parent 6d5049a2
......@@ -341,7 +341,7 @@ class BasicTokenizer(object):
if self.do_lower_case and token not in never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
......
......@@ -119,6 +119,13 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)
def test_basic_tokenizer_respects_never_split_tokens(self):
    """Tokens listed in `never_split` must pass through tokenization unchanged."""
    tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
    # "[UNK]" contains punctuation, so without never_split support it would be
    # broken apart by the punctuation splitter.
    expected = ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
    self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), expected)
def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment