Unverified commit 5abe5038, authored by Sam Shleifer and committed by GitHub

Fix #6096: MBartTokenizer's mask token (#6098)

parent b1c8b769
@@ -122,6 +122,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
         }
         self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
         self.cur_lang_code = self.lang_code_to_id["en_XX"]
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
         self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
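For reference, here is a quick sketch of the id arithmetic in the added line. The concrete sizes (a 250,000-piece SentencePiece model, 25 language codes, a fairseq offset of 1) are assumptions inferred from the test assertions added further down, not stated in the diff itself:

```python
# Sketch of the <mask> id computation, assuming mbart-large-cc25 sizes:
# 250,000 SentencePiece pieces, 25 language codes, fairseq offset 1.
sp_model_size = 250_000   # len(self.sp_model)
num_lang_codes = 25       # len(self.lang_code_to_id)
fairseq_offset = 1        # shift applied to align with the fairseq vocab

# Language codes sit directly after the SentencePiece vocab...
first_lang_code_id = sp_model_size + fairseq_offset              # ar_AR -> 250001
# ...and <mask> is placed one past the last language code.
mask_token_id = sp_model_size + num_lang_codes + fairseq_offset  # -> 250026

assert (mask_token_id, first_lang_code_id) == (250026, 250001)
```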
@@ -123,6 +123,7 @@ class MBartEnroIntegrationTest(AbstractMBartIntegrationTest):
         self.assertEqual(logits.shape, expected_shape)


+@require_torch
 class MBartCC25IntegrationTest(AbstractMBartIntegrationTest):
     checkpoint_name = "facebook/mbart-large-cc25"
     src_text = [
@@ -140,3 +141,14 @@ class MBartCC25IntegrationTest(AbstractMBartIntegrationTest):
         )
         decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
         self.assertEqual(self.tgt_text[0], decoded[0])
+
+    @slow
+    def test_fill_mask(self):
+        inputs = self.tokenizer.prepare_translation_batch(["One of the best <mask> I ever read!"]).to(torch_device)
+        outputs = self.model.generate(
+            inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1
+        )
+        prediction: str = self.tokenizer.batch_decode(
+            outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True
+        )[0]
+        self.assertEqual(prediction, "of the best books I ever read!")
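Outside the test harness, the same fill-mask behaviour can be reproduced with a short standalone script. This is a hedged sketch, not part of the commit: the AutoModelForSeq2SeqLM loading call and the variable names are assumptions; only the tokenizer call and generate arguments mirror the test above.

```python
# Standalone sketch of the fill-mask check above. Assumptions: the
# facebook/mbart-large-cc25 checkpoint and AutoModelForSeq2SeqLM loading;
# the tokenizer/generate arguments are taken from the test itself.
from transformers import AutoModelForSeq2SeqLM, MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-cc25")

batch = tokenizer.prepare_translation_batch(["One of the best <mask> I ever read!"])
generated = model.generate(
    batch["input_ids"],
    decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"],
    num_beams=1,
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
# Per the assertion above: "of the best books I ever read!"
```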
+import tempfile
 import unittest

 from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer
@@ -171,3 +172,13 @@ class MBartEnroIntegrationTest(unittest.TestCase):
         self.assertEqual(ids[-2], 2)
         self.assertEqual(ids[-1], EN_CODE)
         self.assertEqual(len(ids), desired_max_length)
+
+    def test_mask_token(self):
+        self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [250026, 250001])
+
+    def test_special_tokens_unaffacted_by_save_load(self):
+        tmpdirname = tempfile.mkdtemp()
+        original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
+        self.tokenizer.save_pretrained(tmpdirname)
+        new_tok = MBartTokenizer.from_pretrained(tmpdirname)
+        self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
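The save/load test guards against <mask> being dropped from the fairseq id table on a round trip through save_pretrained. A minimal sketch of that round trip, assuming a throwaway temporary directory (variable names here are illustrative):

```python
# Round-trip sketch for the save/load test above; tmpdir handling and
# variable names are illustrative, the checkpoint name is an assumption.
import tempfile

from transformers import MBartTokenizer

tok = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
with tempfile.TemporaryDirectory() as tmpdir:
    tok.save_pretrained(tmpdir)
    reloaded = MBartTokenizer.from_pretrained(tmpdir)

# The fairseq table, including <mask> and the language codes, must survive:
assert reloaded.fairseq_tokens_to_ids == tok.fairseq_tokens_to_ids
assert reloaded.convert_tokens_to_ids(["<mask>", "ar_AR"]) == [250026, 250001]
```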