Unverified Commit 8ad06b7c authored by Pengfei Liu's avatar Pengfei Liu Committed by GitHub
Browse files

using raw string for regex to search <extra_id> (#21162)

* using raw string for regex to search <extra_id>

* fix the same issue in the test file: `tokenization_t5.py`
parent 8a17da2f
......@@ -214,7 +214,7 @@ class T5Tokenizer(PreTrainedTokenizer):
def get_sentinel_tokens(self):
return list(
set(filter(lambda x: bool(re.search("<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)) is not None, self.additional_special_tokens))
)
def get_sentinel_token_ids(self):
......
......@@ -386,7 +386,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
sentinel_tokens = tokenizer.get_sentinel_tokens()
self.assertEquals(len(sentinel_tokens), 10)
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])
def test_get_sentinel_token_ids(self):
tokenizer = T5Tokenizer(SAMPLE_VOCAB, extra_ids=10)
......@@ -397,7 +397,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
sentinel_tokens = tokenizer.get_sentinel_tokens()
self.assertEquals(len(sentinel_tokens), 10)
self.assertListEqual(sorted(sentinel_tokens), sorted([f"<extra_id_{str(i)}>" for i in range(0, 10)]))
self.assertTrue([re.search("<extra_id_\d+>", token) is not None for token in sentinel_tokens])
self.assertTrue([re.search(r"<extra_id_\d+>", token) is not None for token in sentinel_tokens])
def test_get_sentinel_token_ids_for_fasttokenizer(self):
tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment