"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "0148c262e79f5ca12140d7fc35a6d3e0d80d5d3b"
Commit 6179f537 authored by thomwolf's avatar thomwolf
Browse files

clean up tokenization spaces

parent 850da1cc
...@@ -225,8 +225,14 @@ class OpenAIGPTTokenizer(object): ...@@ -225,8 +225,14 @@ class OpenAIGPTTokenizer(object):
tokens.append(self.decoder[i]) tokens.append(self.decoder[i])
return tokens return tokens
def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
    """Convert a sequence of ids into a string.

    Args:
        ids: Token ids; passed unchanged to ``self.convert_ids_to_tokens``.
        skip_special_tokens: Forwarded to ``self.convert_ids_to_tokens``.
        clean_up_tokenization_spaces: When true, remove ``<unk>`` markers and
            the space the tokenization leaves before punctuation and English
            contractions (e.g. ``"hello ."`` -> ``"hello."``,
            ``"do n't"`` -> ``"don't"``). Defaults to ``False`` so existing
            callers get the raw decoded text.

    Returns:
        The decoded string, with every ``</w>`` end-of-word marker turned into
        a space and surrounding whitespace stripped.
    """
    tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
    out_string = ''.join(tokens).replace('</w>', ' ').strip()
    if clean_up_tokenization_spaces:
        out_string = out_string.replace('<unk>', '')
        # Applied strictly in this order; each pair is (text to find, replacement).
        replacements = (
            (' .', '.'),
            (' ?', '?'),
            (' !', '!'),
            (' ,', ','),
            # Repeated on purpose: a second pass also collapses a double space
            # before a comma (e.g. the gap left behind by a removed '<unk>').
            (' ,', ','),
            (" n't", "n't"),
            (" 'm", "'m"),
            (" 're", "'re"),
            (" do not", " don't"),
            (" 's", "'s"),
            (" t ", "'t "),
            (" s ", "'s "),
            (" m ", "'m "),
            (" 've", "'ve"),
        )
        for old, new in replacements:
            out_string = out_string.replace(old, new)
    return out_string
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment