Unverified Commit 851f253f authored by Ren Xuancheng, committed by GitHub

Fix Qwen2Tokenizer (#29929)



qwen2: fixed tokens starting with # in slow tokenizer; add tests
Co-authored-by: jklj077 <17811943+jklj077@users.noreply.github.com>
parent 17b06e2c
@@ -177,9 +177,9 @@ class Qwen2Tokenizer(PreTrainedTokenizer):
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
         bpe_merges = []
         with open(merges_file, encoding="utf-8") as merges_handle:
-            for line in merges_handle:
+            for i, line in enumerate(merges_handle):
                 line = line.strip()
-                if not line or line.startswith("#"):
+                if (i == 0 and line.startswith("#version:")) or not line:
                     continue
                 bpe_merges.append(tuple(line.split()))
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
......
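The hunk above changes how the merges file is read: the old loop treated every line beginning with "#" as a comment, so merge rules whose first symbol is "#" (such as the "\u0120 #" and "# #" merges exercised by the new tests) were silently dropped, while the new loop only skips an optional "#version:" header on the first line. A minimal, self-contained sketch of the difference; the merges text below is made up for illustration and is not the actual Qwen2 vocabulary or the library code:

# Illustrative merges file content (hypothetical), including a "#version:" header
# and two merge rules that involve the "#" symbol.
merges_text = "#version: 0.2\n\u0120 #\n# #\nl o\n"

def load_bpe_ranks_old(text):
    # Old behaviour: every line starting with "#" is treated as a comment,
    # so the "# #" merge rule is silently dropped.
    merges = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        merges.append(tuple(line.split()))
    return dict(zip(merges, range(len(merges))))

def load_bpe_ranks_new(text):
    # New behaviour: only an optional "#version:" header on the first line is
    # skipped, so merge rules whose first symbol is "#" survive.
    merges = []
    for i, line in enumerate(text.splitlines()):
        line = line.strip()
        if (i == 0 and line.startswith("#version:")) or not line:
            continue
        merges.append(tuple(line.split()))
    return dict(zip(merges, range(len(merges))))

print(load_bpe_ranks_old(merges_text))  # {('Ġ', '#'): 0, ('l', 'o'): 1}  -> "# #" lost
print(load_bpe_ranks_new(merges_text))  # {('Ġ', '#'): 0, ('#', '#'): 1, ('l', 'o'): 2}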
@@ -59,6 +59,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 ";}",
                 ";}\u010a",
                 "\u00cf\u0135",
+                "\u0120#",
+                "##",
             ]
         )
@@ -75,6 +77,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "; }",
             ";} \u010a",
             "\u00cf \u0135",
+            "\u0120 #",
+            "# #",
         ]

         self.special_tokens_map = {"eos_token": "<|endoftext|>"}
@@ -129,7 +133,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(tokens, bpe_tokens)

         input_tokens = tokens
-        input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 268, 267]
+        input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 270, 267]
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

     @unittest.skip("We disable the test of pretokenization as it is not reversible.")
@@ -139,6 +143,11 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # the results, by nature, should be different.
         pass

+    @unittest.skip("We disable the test of clean up tokenization spaces as it is not applicable.")
+    def test_clean_up_tokenization_spaces(self):
+        # it only tests bert-base-uncased and clean_up_tokenization_spaces is not applicable to this tokenizer
+        pass
+
     def test_nfc_normalization(self):
         # per https://unicode.org/faq/normalization.html, there are three characters whose normalization forms
         # under NFC, NFD, NFKC, and NFKD are all different
@@ -158,6 +167,16 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             tokenizer_output_string = tokenizer.backend_tokenizer.normalizer.normalize_str(input_string)
             self.assertEqual(tokenizer_output_string, output_string)

+    def test_slow_tokenizer_token_with_number_sign(self):
+        if not self.test_slow_tokenizer:
+            return
+
+        sequence = " ###"
+        token_ids = [268, 269]
+
+        tokenizer = self.get_tokenizer()
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence)), token_ids)
+
     def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self):
         # Qwen2Tokenizer changes the default `spaces_between_special_tokens` in `decode` to False
         if not self.test_slow_tokenizer:
@@ -166,7 +185,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # tokenizer has a special token: `"<|endfotext|>"` as eos, but it is not `legacy_added_tokens`
         # special tokens in `spaces_between_special_tokens` means spaces between `legacy_added_tokens`
         # that would be `"<|im_start|>"` and `"<|im_end|>"` in Qwen/Qwen2 Models
-        token_ids = [259, 260, 268, 269, 26]
+        token_ids = [259, 260, 270, 271, 26]
         sequence = " lower<|endoftext|><|im_start|>;"
         sequence_with_space = " lower<|endoftext|> <|im_start|> ;"
......
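For context, a hedged usage sketch of the behaviour the new slow-tokenizer test targets: with this fix, text containing "#" tokenizes through the Python (slow) Qwen2Tokenizer without losing the "#" merges. This snippet is not part of the PR; the checkpoint name is only an example, and the printed tokens and ids depend on the actual vocabulary.

# Hedged usage sketch: assumes a transformers version that includes this fix and
# access to the Hugging Face Hub; "Qwen/Qwen1.5-0.5B" is an example checkpoint.
from transformers import Qwen2Tokenizer

tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")  # slow (pure Python) tokenizer
tokens = tokenizer.tokenize(" ###")
print(tokens)                                   # merges involving "#" now apply
print(tokenizer.convert_tokens_to_ids(tokens))  # ids depend on the model's vocabulary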