chenpangpang / transformers · Commit 260c8608 (unverified)

Authored Aug 21, 2019 by Thomas Wolf, committed by GitHub on Aug 21, 2019

Merge pull request #1027 from samvelyan/iterative_split_on_token

Re-implemented tokenize() iteratively in PreTrainedTokenizer.

Parents: 9beaa85b d30cbaf5
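For context on why the recursion was worth removing (the page itself doesn't say, so this is an inference): the old split_on_tokens recursed once per token in tok_list, i.e. one Python stack frame per added/special token. Below is a minimal standalone sketch of that failure mode, with str.split standing in for self._tokenize and a made-up token list; this is not code from the commit.

    import sys

    def split_on_tokens_recursive(tok_list, text):
        # Reproduction of the removed recursive splitter; str.split()
        # stands in for self._tokenize(text, **kwargs).
        if not text:
            return []
        if not tok_list:
            return text.split()
        tok = tok_list[0]
        split_text = text.split(tok)
        return sum((split_on_tokens_recursive(tok_list[1:], sub_text.strip()) + [tok]
                    for sub_text in split_text), [])[:-1]

    print(split_on_tokens_recursive(["[MASK]"], "hello [MASK] world"))
    # -> ['hello', '[MASK]', 'world']

    # One recursion level per token: a tokenizer with more added tokens than
    # the interpreter's recursion limit cannot tokenize anything at all.
    print(sys.getrecursionlimit())  # typically 1000
    try:
        split_on_tokens_recursive(["<extra_%d>" % i for i in range(5000)], "hello world")
    except RecursionError:
        print("RecursionError: one stack frame per added token")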
Showing 1 changed file with 34 additions and 4 deletions (+34 -4).
pytorch_transformers/tokenization_utils.py
@@ -477,15 +477,45 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
         """
+        def split_on_token(tok, text):
+            result = []
+            split_text = text.split(tok)
+            for i, sub_text in enumerate(split_text):
+                sub_text = sub_text.strip()
+                if i == 0 and not sub_text:
+                    result += [tok]
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result += [sub_text]
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result += [sub_text]
+                    result += [tok]
+            return result
+
         def split_on_tokens(tok_list, text):
             if not text:
                 return []
             if not tok_list:
                 return self._tokenize(text, **kwargs)
-            tok = tok_list[0]
-            split_text = text.split(tok)
-            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
-                        for sub_text in split_text), [])[:-1]
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self.added_tokens_encoder \
+                            and sub_text not in self.all_special_tokens:
+                        tokenized_text += split_on_token(tok, sub_text)
+                    else:
+                        tokenized_text += [sub_text]
+                text_list = tokenized_text
+
+            return sum((self._tokenize(token, **kwargs) if token not \
+                in self.added_tokens_encoder and token not in self.all_special_tokens \
+                else [token] for token in tokenized_text), [])

         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
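The same splitting logic, pulled out of the class as a minimal runnable sketch of the new iterative path. Assumptions: base_tokenize (here str.split) stands in for self._tokenize, and membership is tested against tok_list, which mirrors the real checks against self.added_tokens_encoder and self.all_special_tokens since split_on_tokens is called with exactly that combined list.

    def split_on_token(tok, text):
        # As in the commit: split text on one token, keep the token itself,
        # and drop empty/whitespace-only fragments.
        result = []
        split_text = text.split(tok)
        for i, sub_text in enumerate(split_text):
            sub_text = sub_text.strip()
            if i == 0 and not sub_text:
                result += [tok]
            elif i == len(split_text) - 1:
                if sub_text:
                    result += [sub_text]
            else:
                if sub_text:
                    result += [sub_text]
                result += [tok]
        return result

    def split_on_tokens(tok_list, text, base_tokenize):
        # Iterative version: one pass over the current fragments per token,
        # instead of one recursion level per token.
        if not text:
            return []
        if not tok_list:
            return base_tokenize(text)
        text_list = [text]
        for tok in tok_list:
            tokenized_text = []
            for sub_text in text_list:
                if sub_text not in tok_list:
                    tokenized_text += split_on_token(tok, sub_text)
                else:
                    tokenized_text += [sub_text]
            text_list = tokenized_text
        # Base-tokenize ordinary fragments; keep special tokens atomic.
        return sum((base_tokenize(token) if token not in tok_list
                    else [token] for token in text_list), [])

    print(split_on_tokens(["[CLS]", "[MASK]"], "[CLS] a puppet [MASK] string", str.split))
    # -> ['[CLS]', 'a', 'puppet', '[MASK]', 'string']

After the change, stack depth no longer depends on the number of added tokens; the work is a flat loop over tok_list times the current number of fragments.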