"...composable_kernel.git" did not exist on "755ace59576810031ea1818e2432ef0364b23080"
Commit df160af7 authored by Pascal Voitot, committed by Lysandre Debut


🐛 #2096: in tokenizer.decode, join a space between all subtexts, not only around added tokens
parent 5b7b78e0
@@ -99,6 +99,21 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
 
+    def test_encode_decode_with_spaces(self):
+        tokenizer = self.get_tokenizer()
+
+        new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+        tokenizer.add_tokens(new_toks)
+        input = "unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]"
+        encoded = tokenizer.encode(input)
+        decoded = tokenizer.decode(encoded)
+
+        self.assertEqual(
+            decoded.lower(),
+            (f"[CLS] {input.lower()} [SEP]").lower()
+        )
+
     def test_is_whitespace(self):
         self.assertTrue(_is_whitespace(u" "))
         self.assertTrue(_is_whitespace(u"\t"))
@@ -139,5 +154,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
+
 if __name__ == '__main__':
     unittest.main()
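
For context, a minimal usage sketch of the behavior the new test pins down. This is not part of the commit; the checkpoint name and the exact decoded casing are assumptions:

```python
# Hypothetical round-trip sketch (not from the commit): requires a
# checkout with this patch and the bert-base-uncased vocabulary.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[ABC]", "[DEF]"])  # user-added tokens, as in the test

ids = tokenizer.encode("unwanted running [ABC] [DEF] running")
print(tokenizer.decode(ids))
# Expected (modulo casing): [CLS] unwanted running [ABC] [DEF] running [SEP]
# Before this fix, consecutive added tokens came back with a doubled space.
```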
@@ -1180,12 +1180,12 @@ class PreTrainedTokenizer(object):
                 if current_sub_text:
                     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                     current_sub_text = []
-                sub_texts.append(" " + token + " ")
+                sub_texts.append(token)
             else:
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ''.join(sub_texts)
+        text = ' '.join(sub_texts)
 
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
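
To make the two changed lines concrete, here is a self-contained sketch of the old and new joining behavior. The helper names and the parts list are hypothetical; only the padding/join strategy mirrors the diff:

```python
# Sketch of the sub_texts joining logic above, isolated from the tokenizer.
# Each entry is (subtext, is_added_token); names here are hypothetical.

def join_old(parts):
    # Pre-fix: pad added tokens with spaces, then join everything with ''.
    return ''.join(" " + t + " " if added else t for t, added in parts)

def join_new(parts):
    # Post-fix: append every subtext bare and join with a single space.
    return ' '.join(t for t, _ in parts)

parts = [("unwanted running", False), ("[ABC]", True),
         ("[DEF]", True), ("running", False)]

print(repr(join_old(parts)))  # 'unwanted running [ABC]  [DEF] running'
print(repr(join_new(parts)))  # 'unwanted running [ABC] [DEF] running'
```

The doubled space between consecutive added tokens (and the stray leading or trailing space when a sentence starts or ends with one) is exactly what #2096 reported.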