Commit bdaba189 authored by thomwolf
Browse files

updating GPT tokenization

parent 18a8a15f
...@@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object): ...@@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object):
if clean_up_tokenization_spaces: if clean_up_tokenization_spaces:
out_string = out_string.replace('<unk>', '') out_string = out_string.replace('<unk>', '')
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ',' out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't" ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m " ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
).replace(" 've", "'ve")
return out_string return out_string
def save_vocabulary(self, vocab_path): def save_vocabulary(self, vocab_path):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment