updating GPT tokenization

bdaba189 · thomwolf · 18a8a15f · bdaba189
Commit bdaba189 authored Apr 16, 2019 by thomwolf
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 3 deletions

pytorch_pretrained_bert/tokenization_openai.py pytorch_pretrained_bert/tokenization_openai.py +2 -3

No files found.
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object):
        if clean_up_tokenization_spaces:
            out_string = out_string.replace('<unk>', '')
            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
-                    ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-                    ).replace(" 've", "'ve")
        return out_string
    def save_vocabulary(self, vocab_path):