"lib/nicbm/nicbm.cc" did not exist on "ea6fd0f9255edde9da935a1e451cb782ab24fbdc"
Unverified commit eef66035 authored by Patrick von Platen, committed by GitHub

[PyTorch Bart] Split Bart into different models (#9343)

* first try

* remove old template

* finish bart

* finish mbart

* delete unnecessary line

* init pegasus

* save intermediate

* correct pegasus

* finish pegasus

* remove cookie cutter leftover

* add marian

* finish blenderbot

* replace in file

* correctly split blenderbot

* delete "old" folder

* correct "add statement"

* adapt config for tf comp

* correct configs for tf

* remove ipdb

* fix more stuff

* fix mbart

* push pegasus fix

* fix mbart

* more fixes

* fix research projects code

* finish docs for bart, mbart, and marian

* delete unnecessary file

* correct attn typo

* correct configs

* remove pegasus for seq class

* correct peg docs

* correct peg docs

* finish configs

* further improve docs

* add copied from statements to mbart

* fix copied from in mbart

* add copy statements to marian

* add copied from to marian

* add pegasus copied from

* finish pegasus

* finish copied from

* Apply suggestions from code review

* make style

* backward comp blenderbot

* apply Lysandre's and Sylvain's suggestions

* apply suggestions

* push last fixes

* fix docs

* fix tok tests

* fix imports code style

* fix doc
parent 4eec5d0c
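After this split, each Bart-derived architecture (mBART, Marian, Pegasus, Blenderbot, BlenderbotSmall) gets its own modeling file and its own classes instead of reusing the Bart implementation. A rough sketch of what the per-model imports look like after the change (the checkpoint name is only an example):

from transformers import (
    BartForConditionalGeneration,
    BlenderbotSmallForConditionalGeneration,
    MarianMTModel,
    MBartForConditionalGeneration,
    PegasusForConditionalGeneration,
)

# Each architecture now loads through its own class rather than a shared Bart implementation.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")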
@@ -1028,14 +1028,22 @@ T5_INPUTS_DOCSTRING = r"""
            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-           Provide for sequence to sequence training. T5 uses the :obj:`pad_token_id` as the starting token for
-           :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last
-           :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`).
+           Indices of decoder input sequence tokens in the vocabulary.
+
+           Indices can be obtained using :class:`~transformers.BartTokenizer`. See
+           :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+           details.
+
+           `What are input IDs? <../glossary.html#input-ids>`__
+
+           T5 uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
+           :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
+           :obj:`past_key_values`).
+
+           To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training
+           <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset,
+           :obj:`decoder_input_ids` takes the value of :obj:`input_ids`.
-       decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+       decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
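The rewritten T5 docstring above says that T5 uses pad_token_id as the decoder start token and that only the last decoder_input_ids needs to be passed once past_key_values is used. A minimal sketch of preparing decoder_input_ids by hand for training; in practice the model builds them from labels when they are not supplied, and the checkpoint name t5-small is only an example:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = tokenizer("translate English to German: How are you?", return_tensors="pt").input_ids
labels = tokenizer("Wie geht es dir?", return_tensors="pt").input_ids

# Shift the labels one position to the right and prepend pad_token_id,
# which T5 uses as the decoder start token.
decoder_input_ids = torch.cat(
    [torch.full((labels.shape[0], 1), model.config.pad_token_id, dtype=torch.long), labels[:, :-1]],
    dim=-1,
)

outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, labels=labels)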
@@ -922,7 +922,7 @@ T5_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.
            `What are attention masks? <../glossary.html#attention-mask>`__
-       decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+       decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`):
@@ -609,6 +609,27 @@ class BlenderbotModel:
        requires_pytorch(self)

+BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class BlenderbotSmallForConditionalGeneration:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class BlenderbotSmallModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
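The placeholder classes added above live in the dummy objects module that backs the public API when PyTorch is not installed: instantiating or loading one raises a helpful error instead of failing at import time. A rough sketch of how the requires_pytorch guard behaves (the exact error wording used by the library differs):

from transformers import is_torch_available


def requires_pytorch(obj):
    # Use the class (or function) name in the error so the user knows what needs PyTorch.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    if not is_torch_available():
        raise ImportError(f"{name} requires the PyTorch library, but it was not found in your environment.")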
@@ -1327,6 +1348,15 @@ class LxmertXLayer:
        requires_pytorch(self)

+class MarianModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
class MarianMTModel:
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)
@@ -1345,6 +1375,24 @@ class MBartForConditionalGeneration:
        requires_pytorch(self)

+class MBartForQuestionAnswering:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class MBartForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
class MBartModel:
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)
@@ -135,6 +135,10 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
        >>> configuration = model.config
    """

    model_type = "{{cookiecutter.lowercase_modelname}}"
+    {% if cookiecutter.is_encoder_decoder_model == "False" -%}
+    {% else -%}
+    keys_to_ignore_at_inference = ["past_key_values"]
+    {% endif -%}

    def __init__(
        self,
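With the template change above, configs generated for encoder-decoder models now declare keys_to_ignore_at_inference, matching the hand-written configs of the Bart-family models. Rendered for a hypothetical encoder-decoder model, the generated class would start roughly like this (the class name and model_type string are placeholders):

from transformers import PretrainedConfig


class MyEncoderDecoderConfig(PretrainedConfig):
    model_type = "my-encoder-decoder"
    # Output entries (here the cached past_key_values) to ignore at inference time.
    keys_to_ignore_at_inference = ["past_key_values"]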
@@ -1849,7 +1849,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
        decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Provide for translation and summarization training. By default, the model will create this tensor by
            shifting the input_ids right, following the paper.
-       decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+       decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
        encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
            hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
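The docstring above describes the Bart-style default: when decoder_input_ids is not given, the model creates it by shifting the input one position to the right. A minimal sketch of such a shift helper in TensorFlow; the helper name and the -100 handling are modeled on the Bart-family implementation and may differ per model:

import tensorflow as tf


def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int) -> tf.Tensor:
    # Prepend the decoder start token and drop the last position so the decoder
    # sees the target sequence shifted one step to the right.
    batch_size = tf.shape(input_ids)[0]
    start_tokens = tf.fill(tf.stack([batch_size, 1]), tf.constant(decoder_start_token_id, dtype=input_ids.dtype))
    shifted = tf.concat([start_tokens, input_ids[:, :-1]], axis=-1)
    # Replace -100 (the ignore index commonly used in labels) with the real pad token.
    pad = tf.fill(tf.shape(shifted), tf.constant(pad_token_id, dtype=shifted.dtype))
    return tf.where(tf.equal(shifted, -100), pad, shifted)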
@@ -480,8 +480,6 @@ import copy
import tempfile
import unittest
import timeout_decorator # noqa
from transformers import is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
@@ -246,16 +246,16 @@ TOLERANCE = 1e-4
class TFBartModelIntegrationTest(unittest.TestCase):
    def test_inference_no_head(self):
        model = TFBartModel.from_pretrained("facebook/bart-large", from_pt=True)
        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-       inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
-       # with torch.no_grad():
-       output = model(**inputs_dict)[0]
+       attention_mask = tf.cast(tf.math.not_equal(input_ids, model.config.pad_token_id), tf.int8)
+       output = model(input_ids=input_ids, attention_mask=attention_mask)[0]
        expected_shape = (1, 11, 1024)
        self.assertEqual(output.shape, expected_shape)
-       expected_slice = tf.Tensor(
+       expected_slice = tf.convert_to_tensor(
            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
        )
-       self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+       tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)

    def test_cnn_summarization_same_as_fairseq_hard(self):
        hf = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn", from_pt=True)
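The integration-test changes above fix two issues: tf.Tensor is not a public constructor, so expected values are built with tf.convert_to_tensor, and tf.debugging.assert_near raises on mismatch (returning None in eager mode) rather than returning a boolean, so wrapping it in assertTrue is not meaningful. A tiny illustration with made-up values:

import tensorflow as tf

expected = tf.convert_to_tensor([[0.71, 0.81], [-0.05, 2.59]])
actual = expected + 1e-6

# Raises tf.errors.InvalidArgumentError if any element differs by more than atol;
# in eager mode it returns None on success, so there is nothing to assertTrue on.
tf.debugging.assert_near(actual, expected, atol=1e-4)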