Unverified commit efae1549 authored by Funtowicz Morgan, committed by GitHub

never_split on slow tokenizers should not split (#4723)

* Ensure tokens in never_split are not split when using the basic tokenizer before WordPiece.

* never_split is only used for membership tests, so store it as a set(), which is about 10x faster than a list for this operation (see the rough sketch below the commit metadata).

* Use set.union() to merge the two sets.

* Updated docstring for never_split parameter.

* Avoid set.union() if never_split is None

* Added comments.

* Correct docstring format.
parent 2e4de762
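The speed claim in the commit message can be checked with a rough, illustrative micro-benchmark; the token names and list size below are made up for illustration and are not part of the change:

```python
# Illustrative only: membership tests on a list vs. a set, mirroring why
# never_split is now stored as a set. Actual speedup depends on its size.
import timeit

never_split_list = ["[unused%d]" % i for i in range(1000)]
never_split_set = set(never_split_list)

t_list = timeit.timeit(lambda: "[unused999]" in never_split_list, number=100_000)
t_set = timeit.timeit(lambda: "[unused999]" in never_split_set, number=100_000)
print("list: %.4fs  set: %.4fs" % (t_list, t_set))
```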
@@ -130,8 +130,8 @@ class BertTokenizer(PreTrainedTokenizer):
             Whether to lowercase the input when tokenizing.
         do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether to do basic tokenization before WordPiece.
-        never_split (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            List of tokens which will never be split during tokenization. Only has an effect when
+        never_split (:obj:`Iterable`, `optional`, defaults to :obj:`None`):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
             :obj:`do_basic_tokenize=True`
         unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
@@ -208,8 +208,12 @@ class BertTokenizer(PreTrainedTokenizer):
         split_tokens = []
         if self.do_basic_tokenize:
             for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                    split_tokens.append(sub_token)
+
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
         else:
             split_tokens = self.wordpiece_tokenizer.tokenize(text)
         return split_tokens
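The hunk above is the core of the fix: tokens in never_split now bypass WordPiece. Below is a standalone toy sketch of that branch, where toy_wordpiece is a hypothetical stand-in for self.wordpiece_tokenizer.tokenize:

```python
# Toy sketch of the new branch: never_split tokens are kept whole.
def toy_wordpiece(token):
    # Hypothetical stand-in: pretend WordPiece splits anything longer than 4 chars.
    return [token] if len(token) <= 4 else [token[:4], "##" + token[4:]]

never_split = {"[CUSTOM]"}
split_tokens = []
for token in ["hello", "[CUSTOM]", "tokenization"]:  # pretend basic-tokenizer output
    if token in never_split:
        split_tokens.append(token)            # new behavior: kept whole
    else:
        split_tokens += toy_wordpiece(token)  # old behavior for everything else
print(split_tokens)  # ['hell', '##o', '[CUSTOM]', 'toke', '##nization']
```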
@@ -363,7 +367,7 @@ class BasicTokenizer(object):
         if never_split is None:
             never_split = []
         self.do_lower_case = do_lower_case
-        self.never_split = never_split
+        self.never_split = set(never_split)
         self.tokenize_chinese_chars = tokenize_chinese_chars

     def tokenize(self, text, never_split=None):
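A hedged usage sketch for the constructor change above; it assumes a release from around this change, where BasicTokenizer is importable from transformers.tokenization_bert (newer releases moved the module, so the import path may differ):

```python
# Assumed import path for this era of the library; adjust for newer releases.
from transformers.tokenization_bert import BasicTokenizer

bt = BasicTokenizer(do_lower_case=True, never_split=["[CUSTOM]", "[CUSTOM]"])
print(type(bt.never_split))  # <class 'set'>: duplicates collapse, O(1) membership
print(bt.tokenize("Hello [CUSTOM] world!"))
# expected: ['hello', '[CUSTOM]', 'world', '!']
```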
@@ -376,8 +380,9 @@ class BasicTokenizer(object):
                 Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                 List of token not to split.
         """
-        never_split = self.never_split + (never_split if never_split is not None else [])
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
         # matter since the English models were not trained on any Chinese data
......
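Finally, a minimal illustration of the merge logic in the last hunk: the per-call never_split is unioned with the stored set only when it is provided, so the common never_split=None path allocates nothing new (names below are illustrative, not library API):

```python
# Mirrors the conditional expression in the diff above.
instance_never_split = {"[UNK]", "[CLS]", "[SEP]"}

def merged(call_never_split):
    return (instance_never_split.union(set(call_never_split))
            if call_never_split else instance_never_split)

print(merged(None) is instance_never_split)  # True: no new set allocated
print(sorted(merged(["[CUSTOM]"])))          # ['[CLS]', '[CUSTOM]', '[SEP]', '[UNK]']
```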