Unverified Commit dc3f6758 authored by Vasudev Gupta, committed by GitHub

Add BigBirdPegasus (#10991)



* init bigbird pegasus

* add debugging nb ; update config

* init conversion

* update conversion script

* complete conversion script

* init forward()

* complete forward()

* add tokenizer

* add some slow tests

* commit current

* fix copies

* add docs

* add conversion script for bigbird-roberta-summarization

* remove TODO

* small fixups

* correct tokenizer

* add bigbird core for now

* fix config

* fix more

* revert pegasus-tokenizer back

* make style

* everything working for pubmed; yayy

* complete tests finally

* remove bigbird pegasus tok

* correct tokenizer

* correct tests

* add tokenizer files

* finish make style

* fix test

* update

* make style

* fix tok utils base file

* make fix-copies

* clean a bit

* small update

* fix some suggestions

* add to readme

* fix a bit, clean tests

* fix more tests

* Update src/transformers/__init__.py

* Update src/transformers/__init__.py

* make fix-copies

* complete attn switching, auto-padding left (see the sketch after the tokenizer tests below)

* make style

* fix auto-padding test

* make style

* fix batched attention tests

* put tolerance at 1e-1 for stand-alone decoder test

* fix docs

* fix tests

* correct slow tokenizer conversion

* Apply suggestions from code review
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* complete remaining suggestions

* fix test
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 6f40e317
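
For orientation (not part of the commit itself): a minimal sketch of how the model added here can be exercised, assuming the google/bigbird-pegasus-large-arxiv checkpoint referenced in the tests below and the BigBirdPegasusForConditionalGeneration class this PR introduces. Illustrative only.

    from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

    # Checkpoint name taken from the tests in this diff.
    tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
    model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

    # BigBird's block-sparse attention supports inputs up to 4096 tokens,
    # matching the shapes asserted in test_large_seq2seq_truncation below.
    article = "Replace with a long scientific article to summarize..."
    inputs = tokenizer(article, truncation=True, max_length=4096, return_tensors="pt")
    summary_ids = model.generate(**inputs, num_beams=4, max_length=128)
    print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
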
@@ -55,7 +55,6 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
         rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
         py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
-        # TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_token_x> yet
         self.assertListEqual(py_ids, rust_ids)
 
     def test_large_mask_tokens(self):
@@ -96,3 +95,81 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         assert batch.attention_mask.shape == (2, 1024)
         assert targets["input_ids"].shape == (2, 5)
         assert len(batch) == 2  # input_ids, attention_mask.
+
+
+@require_sentencepiece
+@require_tokenizers
+class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+    tokenizer_class = PegasusTokenizer
+    rust_tokenizer_class = PegasusTokenizerFast
+    test_rust_tokenizer = True
+
+    def setUp(self):
+        super().setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]")
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    @cached_property
+    def _large_tokenizer(self):
+        return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
+
+    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
+        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self, tokenizer):
+        return ("This is a test", "This is a test")
+
+    def test_mask_tokens_rust_pegasus(self):
+        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
+        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
+        raw_input_str = "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s> <pad> <pad> <pad>"
+        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        self.assertListEqual(py_ids, rust_ids)
+
+    @require_torch
+    def test_large_seq2seq_truncation(self):
+        src_texts = ["This is going to be way too long." * 1000, "short example"]
+        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
+        batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
+        with self._large_tokenizer.as_target_tokenizer():
+            targets = self._large_tokenizer(
+                tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
+            )
+
+        assert batch.input_ids.shape == (2, 4096)
+        assert batch.attention_mask.shape == (2, 4096)
+        assert targets["input_ids"].shape == (2, 5)
+        assert len(batch) == 2  # input_ids, attention_mask.
+
+    def test_equivalence_to_orig_tokenizer(self):
+        """
+        To run with original TF tokenizer:
+
+        !wget https://github.com/google-research/bigbird/raw/master/bigbird/vocab/pegasus.model
+        !pip install tensorflow-text
+
+        import tensorflow.compat.v2 as tf
+        import tensorflow_text as tft
+
+        VOCAB_FILE = "./pegasus.model"
+
+        tf.enable_v2_behavior()
+
+        test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
+        tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(VOCAB_FILE, "rb").read())
+
+        tokenizer.tokenize(test_str)
+        """
+        test_str = "This is an example string that is used to test the original TF implementation against the HF implementation"
+
+        token_ids = self._large_tokenizer(test_str).input_ids
+
+        self.assertListEqual(
+            token_ids,
+            [182, 117, 142, 587, 4211, 120, 117, 263, 112, 804, 109, 856, 25016, 3137, 464, 109, 26955, 3137, 1],
+        )
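
The "complete attn switching, auto-padding left" commit above refers to BigBird's two attention modes. A hedged sketch of toggling them via the config (field names follow the BigBird-style config; the exact values shown are assumptions, not taken from this diff):

    from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration

    # "block_sparse" uses BigBird's sparse attention for long inputs;
    # "original_full" falls back to standard full attention.
    config = BigBirdPegasusConfig(attention_type="block_sparse", block_size=64, num_random_blocks=3)

    # The attention type can also be overridden when loading a pretrained
    # checkpoint; short inputs are often better served by full attention.
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(
        "google/bigbird-pegasus-large-arxiv", attention_type="original_full"
    )
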
@@ -35,6 +35,9 @@ PATH_TO_DOC = "docs/source"
 # Being in this list is an exception and should **not** be the rule.
 IGNORE_NON_TESTED = [
     # models to ignore for not tested
+    "BigBirdPegasusEncoder",  # Building part of bigger (tested) model.
+    "BigBirdPegasusDecoder",  # Building part of bigger (tested) model.
+    "BigBirdPegasusDecoderWrapper",  # Building part of bigger (tested) model.
     "M2M100Encoder",  # Building part of bigger (tested) model.
     "M2M100Decoder",  # Building part of bigger (tested) model.
     "Speech2TextEncoder",  # Building part of bigger (tested) model.
......
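
The IGNORE_NON_TESTED entries above exempt building-block classes from the repo-wide "every model must be tested" check. A hedged sketch of how such a check can consume the list; the real logic lives in utils/check_repo.py and differs in detail, and the helper name here is hypothetical:

    # Hypothetical helper; only IGNORE_NON_TESTED comes from the diff above.
    def find_untested_models(all_model_classes, tested_model_classes):
        """Return model class names that have no tests and no exemption."""
        return sorted(
            name
            for name in all_model_classes
            if name not in tested_model_classes and name not in IGNORE_NON_TESTED
        )
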