Unverified Commit ecfe9be7 authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

[UDOP] Add special tokens to tokenizer (#29594)

* Add special tokens

* Add special tokens

* Use fmt

* Uncomment code

* Add test

* Remove scripts

* Address comments

* Improve tests

* Address comment

* Remove flag
parent d9850abd
...@@ -1893,3 +1893,31 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -1893,3 +1893,31 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(encoding_p["attention_mask"], [1, 1, 1]) self.assertListEqual(encoding_p["attention_mask"], [1, 1, 1])
self.assertDictEqual(dict(encoding_p), dict(encoding_r)) self.assertDictEqual(dict(encoding_p), dict(encoding_r))
self.assertEqual(tokenizer_p.decode(encoding_p["input_ids"]), expected_decoding) self.assertEqual(tokenizer_p.decode(encoding_p["input_ids"]), expected_decoding)
def test_special_tokens(self):
    """Regression test: slow and fast UDOP tokenizers must agree on the added
    ``<loc_*>`` special tokens, both when encoding and when decoding.

    NOTE(review): requires downloading the "microsoft/udop-large" checkpoint,
    so this only runs where network access to the Hub is available.
    """
    tokenizer_p = UdopTokenizer.from_pretrained("microsoft/udop-large")
    tokenizer_r = UdopTokenizerFast.from_pretrained("microsoft/udop-large")

    # encode: both tokenizers must map a <loc_*> special token to the same single id
    text = "paragraph<loc_58>. Hey"
    encoding_p = tokenizer_p.encode(text)
    encoding_r = tokenizer_r.encode(text)
    assert encoding_p == encoding_r == [8986, 32942, 3, 5, 9459, 1]

    # decode
    # this is different between slow/fast tokenizer
    # due to the former having `spaces_between_special_tokens=True` by default
    ids = [0, 8986, 32942, 32966, 32554, 32551, 1]

    # test slow tokenizer (explicitly disable the extra spaces to match the fast output shape)
    decoding = tokenizer_p.decode(ids, spaces_between_special_tokens=False)
    expected_decoding = "<pad>paragraph<loc_58><loc_34><loc_446><loc_449></s>"
    assert decoding == expected_decoding

    # test fast tokenizer
    decoding = tokenizer_r.decode(ids)
    expected_decoding = "<pad> paragraph<loc_58><loc_34><loc_446><loc_449></s>"
    assert decoding == expected_decoding
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment