Commit 24831477 authored by thomwolf
Browse files

fix tokenization

parent 03c2c762
...@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): ...@@ -55,7 +55,7 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
text = "adapt react readapt apt" text = "adapt react readapt apt"
bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split() bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split()
- tokens = tokenizer.tokenize(text, add_prefix_space=True)
+ tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens) self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + [tokenizer.unk_token] input_tokens = tokens + [tokenizer.unk_token]
......
...@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer): ...@@ -205,7 +205,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """ """ Converts a sequence of tokens (string) in a single string. """
- out_string = ''.join(tokens).replace('@@', ' ').strip()
+ out_string = ' '.join(tokens).replace('@@ ', '').strip()
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment