"tests/quantization/vscode:/vscode.git/clone" did not exist on "e3fc90ae680becbe90eb5bbf58b409cae76ff8c8"
Unverified commit 1e79eade authored by Ita Zaporozhets, committed by GitHub

SPLIT PR: add user defined symbols and control symbols (#31305)

* PR SPLIT: moving original changes for adding user defined symbols

* adding gemma test and generalizing gemma converter

* ruff

* update common test

* update serialization test

* deberta v2 test updates, as the rust version adds '.' as a user-added token, so a space is not added

* removing commented lines

* applying feedback - only add the user added_tokens, and check piece.type instead of trainer_spec for user_defined_symbols

* add comment referencing sentencepiece
parent 730a4407
@@ -622,6 +622,17 @@ class SpmConverter(Converter):
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)
 
+        # Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
+        user_defined_symbols = [
+            AddedToken(token, normalized=False, special=False)
+            for token in [p.piece for p in self.proto.pieces if p.type == 4]
+        ]
+        control_symbols = [
+            AddedToken(token, normalized=False, special=True) for token in self.proto.trainer_spec.control_symbols
+        ]
+        tokenizer.add_tokens(user_defined_symbols + control_symbols)
+
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:
@@ -1330,10 +1341,6 @@ class GemmaConvert(SpmConverter):
             raise Exception(
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
             )
-        user_defined_symbols = [
-            AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols
-        ]
-        tokenizer.add_tokens(user_defined_symbols)
         return tokenizer
...
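For context on what the converter change above reads: the snippet below is a minimal, standalone sketch (not part of this PR) that inspects a SentencePiece model proto and lists the pieces marked as user-defined (type == 4) or control (type == 3), plus the `trainer_spec.control_symbols` field. It assumes the `sentencepiece` and `protobuf` packages are installed; `spm_path` is a placeholder for any local `tokenizer.model` file.

```python
# Sketch: list user-defined and control pieces of a SentencePiece model,
# mirroring what SpmConverter.converted() now pulls into the fast tokenizer.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

spm_path = "tokenizer.model"  # placeholder: any SentencePiece model file

proto = model_pb2.ModelProto()
with open(spm_path, "rb") as f:
    proto.ParseFromString(f.read())

# Piece types per sentencepiece_model.proto: NORMAL=1, UNKNOWN=2,
# CONTROL=3, USER_DEFINED=4, UNUSED=5, BYTE=6.
user_defined = [p.piece for p in proto.pieces if p.type == 4]
control_pieces = [p.piece for p in proto.pieces if p.type == 3]

print("user-defined symbols:", user_defined)
print("control pieces:", control_pieces)
print("trainer_spec.control_symbols:", list(proto.trainer_spec.control_symbols))
```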
@@ -144,7 +144,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False)
@@ -198,7 +198,13 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                 # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                 with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                    self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                    with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
+                        self.assertTrue(
+                            all(
+                                item in tokenizer.added_tokens_decoder.items()
+                                for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                            )
+                        )
 
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
                 with tempfile.TemporaryDirectory() as tmp_dir_4:
...
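The switch from `assertDictEqual` to the `all(... in ...)` check above reflects that, after this PR, the fast tokenizer's `added_tokens_decoder` can carry extra entries (the user-defined and control symbols pulled in by the converter), so the expected mapping is asserted as a subset rather than an exact match. A minimal sketch of that subset check as a standalone helper; `assert_added_tokens_subset` and the sample dicts are hypothetical names used only for illustration:

```python
# Hypothetical helper (illustration only): every expected (id, token) entry
# must be present in the actual mapping, while extra entries coming from
# user-defined or control symbols are tolerated.
def assert_added_tokens_subset(expected: dict, actual: dict) -> None:
    mismatched = {k: v for k, v in expected.items() if k not in actual or actual[k] != v}
    if mismatched:
        raise AssertionError(f"missing or mismatched added tokens: {mismatched}")


# Usage sketch, with plain strings standing in for AddedToken objects:
expected = {2: "</s>", 32000: "[NEW_EOS]"}
actual = {2: "</s>", 32000: "[NEW_EOS]", 32001: "."}  # "." added as a user-defined symbol
assert_added_tokens_subset(expected, actual)          # passes: expected is a subset
```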
@@ -89,8 +89,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_split_by_punct(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", split_by_punct=True)
@@ -105,8 +105,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_do_lower_case_split_by_punct(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True, split_by_punct=True)
@@ -121,8 +121,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_do_lower_case_split_by_punct_false(self):
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "!", ]
         # fmt: on
 
         tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=True, split_by_punct=False)
@@ -139,8 +139,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_do_lower_case_false_split_by_punct(self):
        # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "▁", "!", ]
        # fmt: on
 
        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>", do_lower_case=False, split_by_punct=True)
@@ -177,7 +177,7 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer = self.get_tokenizer()
         rust_tokenizer = self.get_rust_tokenizer()
 
-        sequence = "I was born in 92000, and this is falsé."
+        sequence = "I was born in 92000, and this is falsé!"
 
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
         rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
@@ -216,10 +216,10 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertListEqual(rust_back_tokens, back_tokens_target)
 
         # fmt: off
-        sequence = "I was born in 92000, and this is falsé."
-        ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]
-        tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", ".", ]
-        back_tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", ".", ]
+        sequence = "I was born in 92000, and this is falsé!"
+        ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 187]
+        tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "!", ]
+        back_tokens_target = ["▁", "<unk>", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "<unk>", "!", ]
         # fmt: on
 
         ids = tokenizer.encode(sequence, add_special_tokens=False)
...
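These test sentences now end in "!" instead of "." because, per the commit message, the SentencePiece vocab used here marks "." as a user-defined symbol: with this PR the rust (fast) tokenizer registers it as an added token and matches it as a whole piece, without the preceding "▁", which would break the old expected outputs. One way to see whether a character ended up in a fast tokenizer's added vocabulary is sketched below; the checkpoint name is only a placeholder, and the exact tokens printed depend on the underlying model file.

```python
# Sketch (placeholder checkpoint): check whether "." was registered as an
# added token in the fast tokenizer and how that affects tokenization.
from transformers import AutoTokenizer

fast_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")  # placeholder

added_vocab = fast_tokenizer.get_added_vocab()  # {token_string: token_id}
print("." in added_vocab)                # True if "." is a user-defined symbol in the spm model
print(fast_tokenizer.tokenize("falsé."))
print(fast_tokenizer.tokenize("falsé!"))
```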
@@ -193,6 +193,19 @@ class GemmaIntegrationTest(unittest.TestCase):
            },
        )
 
+    def test_user_added_tokens(self):
+        # Ensure that user added tokens are not split in the fast tokenizer
+        slow_tokenizer = self.tokenizer
+        fast_tokenizer = self.rust_tokenizer
+
+        user_added_token = "<mask>"
+
+        slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token))
+        fast_tokens = slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token))
+
+        self.assertTrue(user_added_token in fast_tokens)
+        self.assertEqual(slow_tokens, fast_tokens)
+
     def test_fast_special_tokens(self):
         slow_tokenizer = self.tokenizer
         fast_tokenizer = self.rust_tokenizer
...
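The new `test_user_added_tokens` asserts that a symbol the SentencePiece model marks as user-defined (here `<mask>`) is kept as a single token by the fast tokenizer and matches the slow tokenizer's output. Outside the test suite, roughly the same check could look like the sketch below; the Gemma checkpoint name is a placeholder (the repo is gated and needs hub access), and whether `<mask>` is user-defined depends on the model file.

```python
# Standalone sketch of the same check; "google/gemma-2b" is a placeholder
# gated checkpoint, and <mask> is assumed to be a user-defined symbol.
from transformers import AutoTokenizer

slow = AutoTokenizer.from_pretrained("google/gemma-2b", use_fast=False)
fast = AutoTokenizer.from_pretrained("google/gemma-2b", use_fast=True)

token = "<mask>"
slow_tokens = slow.convert_ids_to_tokens(slow.encode(token))
fast_tokens = fast.convert_ids_to_tokens(fast.encode(token))

assert token in fast_tokens, "user-defined symbol was split by the fast tokenizer"
assert slow_tokens == fast_tokens
```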
@@ -172,7 +172,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
@@ -227,7 +227,12 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                 # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                 with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                    self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                    self.assertTrue(
+                        all(
+                            item in tokenizer.added_tokens_decoder.items()
+                            for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                        )
+                    )
 
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
                 with tempfile.TemporaryDirectory() as tmp_dir_4:
...
@@ -4228,7 +4228,7 @@ class TokenizerTesterMixin:
             self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
             self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
             self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer
 
         new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
@@ -4280,7 +4280,13 @@ class TokenizerTesterMixin:
                 self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                 # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
                 with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                    self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+                    # Fast tokenizer may have user_defined_symbols and control_symbols added, unlike slow
+                    self.assertTrue(
+                        all(
+                            item in tokenizer.added_tokens_decoder.items()
+                            for item in EXPECTED_ADDED_TOKENS_DECODER.items()
+                        )
+                    )
 
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
                 with tempfile.TemporaryDirectory() as tmp_dir_4:
...