"git@developer.sourcefind.cn:chenpangpang/ComfyUI.git" did not exist on "0d358b95665a4a2c3edbdf2a4e51920e4c773fb1"
Commit 65a89a89 authored by Mark Neumann, committed by Julien Chaumond
Browse files

Fix BasicTokenizer to respect `never_split` parameters (#2557)

* add failing test

* fix call to _run_split_on_punc

* format with black
parent 6d5049a2
...@@ -341,7 +341,7 @@ class BasicTokenizer(object): ...@@ -341,7 +341,7 @@ class BasicTokenizer(object):
if self.do_lower_case and token not in never_split: if self.do_lower_case and token not in never_split:
token = token.lower() token = token.lower()
token = self._run_strip_accents(token) token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token)) split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens)) output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens return output_tokens
......
...@@ -119,6 +119,13 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -119,6 +119,13 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
) )
def test_basic_tokenizer_respects_never_split_tokens(self):
    """Tokens listed in ``never_split`` must come through tokenization intact.

    With ``do_lower_case=False`` and ``never_split=["[UNK]"]``, the special
    token "[UNK]" should not be split on its punctuation characters.
    """
    tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
    expected = ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
    self.assertListEqual(tokenizer.tokenize(" \tHeLLo!how \n Are yoU? [UNK]"), expected)
def test_wordpiece_tokenizer(self): def test_wordpiece_tokenizer(self):
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment