Unverified Commit 2a6fbe6a authored by Patrick von Platen, committed by GitHub

[XLNet] Fix mems behavior (#8567)

* fix mems in xlnet

* fix use_mems

* fix use_mem_len

* fix use mems

* clean docs

* fix tf typo

* make xlnet tf for generation work

* fix tf test

* refactor use cache

* add use cache for missing models

* correct use_cache in generate

* correct use cache in tf generate

* fix tf

* correct getattr typo

* make sylvain happy

* change in docs as well

* do not apply to cookie cutter statements

* fix tf test

* make pytorch model fully backward compatible
parent 369f1d77
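
Before the diff: a usage sketch (not part of the commit) of the mems behavior the message above refers to, assuming the `use_mems` forward argument and the `mems` output field that this PR introduces for XLNet; exact argument names may differ between releases.

```python
# Hypothetical illustration of XLNet's memory cache after this PR.
# Assumes the `use_mems` forward argument and `mems` output field exist as described.
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

inputs = tokenizer("XLNet caches hidden states as memory for the next segment", return_tensors="pt")

with torch.no_grad():
    # With mems enabled, the cached hidden states are returned and can be fed
    # back via the `mems` argument when processing the following segment.
    with_mems = model(**inputs, use_mems=True, return_dict=True)
    # With mems disabled, no cache is returned.
    without_mems = model(**inputs, use_mems=False, return_dict=True)

print(with_mems.mems is not None)   # expected: True
print(without_mems.mems is None)    # expected: True
```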
@@ -809,7 +809,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
 @add_start_docstrings(
     """
-    Albert Model with two heads on top for pre-training: a `masked language modeling` head and a `sentence order
+    Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order
     prediction` (classification) head.
     """,
     ALBERT_START_DOCSTRING,
...
@@ -108,6 +108,8 @@ class BartConfig(PretrainedConfig):
         force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
             :obj:`True` for `bart-large-cnn`.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "bart"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -134,9 +136,6 @@ class BartConfig(PretrainedConfig):
         classifier_dropout=0.0,
         num_labels=3,
         is_encoder_decoder=True,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
         normalize_before=False,
         add_final_layer_norm=False,
         do_blenderbot_90_layernorm=False,

@@ -145,6 +144,10 @@ class BartConfig(PretrainedConfig):
         static_position_embeddings=False,
         add_bias_logits=False,
         force_bos_token_to_be_generated=False,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
         **common_kwargs
     ):
         r"""

@@ -208,6 +211,8 @@ class BartConfig(PretrainedConfig):
         self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm

+        self.use_cache = use_cache
+
     @property
     def num_attention_heads(self) -> int:
         return self.encoder_attention_heads
...
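
The `use_cache` entry documented in the hunk above is a plain configuration flag; a minimal sketch of setting it on `BartConfig` (a standalone config built only for illustration, not tied to any released checkpoint):

```python
# Minimal sketch: `use_cache` defaults to True and can be overridden at construction time.
from transformers import BartConfig

config = BartConfig()
print(config.use_cache)        # True (new default added by this commit)
print(config.pad_token_id, config.bos_token_id, config.eos_token_id)  # 1 0 2

no_cache_config = BartConfig(use_cache=False)
print(no_cache_config.use_cache)  # False: models built from this config return no past key/values by default
```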
@@ -888,7 +888,7 @@ class BertModel(BertPreTrainedModel):
 @add_start_docstrings(
     """
-    Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
+    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
     sentence prediction (classification)` head.
     """,
     BERT_START_DOCSTRING,
...
@@ -90,7 +90,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
 class TFBertPreTrainingLoss:
     """
-    Loss function suitable for BERT-like pre-training, that is, the task of pretraining a language model by combining
+    Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
     NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
     computation.
     """

@@ -878,7 +878,7 @@ class TFBertModel(TFBertPreTrainedModel):
 @add_start_docstrings(
     """
-    Bert Model with two heads on top as done during the pre-training:
+    Bert Model with two heads on top as done during the pretraining:
     a `masked language modeling` head and a `next sentence prediction (classification)` head.
     """,
     BERT_START_DOCSTRING,
...
@@ -80,7 +80,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
         normalization (:obj:`bool`, `optional`, defaults to :obj:`False`)
             Whether or not to apply a normalization preprocess.
         bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
-            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

             .. note::
...
@@ -61,6 +61,9 @@ class CTRLConfig(PretrainedConfig):
             The epsilon to use in the layer normalization layers
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
     Examples::

@@ -98,6 +101,7 @@ class CTRLConfig(PretrainedConfig):
         summary_activation=None,
         summary_proj_to_labels=True,
         summary_first_dropout=0.1,
+        use_cache=True,
         **kwargs
     ):
         super().__init__(**kwargs)

@@ -119,6 +123,7 @@ class CTRLConfig(PretrainedConfig):
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
+        self.use_cache = use_cache

     @property
     def max_position_embeddings(self):
...
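
As elsewhere, the config value is only a default; a `use_cache` argument passed to the forward call is expected to take precedence. A hedged sketch with a tiny, randomly initialized CTRL model (all sizes are illustrative, nothing is downloaded):

```python
# Sketch of the expected precedence: a per-call `use_cache` overrides `config.use_cache`.
import torch
from transformers import CTRLConfig, CTRLLMHeadModel

config = CTRLConfig(vocab_size=100, n_positions=64, n_embd=32, dff=64, n_layer=2, n_head=2, use_cache=False)
model = CTRLLMHeadModel(config)
input_ids = torch.tensor([[5, 6, 7]])

with torch.no_grad():
    default_out = model(input_ids, return_dict=True)                    # follows config: no cache
    override_out = model(input_ids, use_cache=True, return_dict=True)   # per-call override

print(default_out.past_key_values is None)       # expected: True
print(override_out.past_key_values is not None)  # expected: True
```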
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
     The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
     <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
     BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
-    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
+    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

     This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
     subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
...
@@ -891,8 +891,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
 @add_start_docstrings(
     """
-    Electra model with a binary classification head on top as used during pre-training for identifying generated
-    tokens.
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

     It is recommended to load the discriminator checkpoint into that model.
     """,
...
@@ -789,8 +789,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
 @add_start_docstrings(
     """
-    Electra model with a binary classification head on top as used during pre-training for identifying generated
-    tokens.
+    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

     Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model
     of the two to have the correct classification head to be used for this model.
...
@@ -109,6 +109,8 @@ class FSMTConfig(PretrainedConfig):
         early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
             Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
             search when at least ``num_beams`` sentences are finished per batch or not.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).

     Examples::

@@ -142,9 +144,6 @@ class FSMTConfig(PretrainedConfig):
         dropout=0.1,
         activation_dropout=0.0,
         init_std=0.02,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
         decoder_start_token_id=2,
         is_encoder_decoder=True,
         scale_embedding=True,

@@ -152,6 +151,10 @@ class FSMTConfig(PretrainedConfig):
         num_beams=5,
         length_penalty=1.0,
         early_stopping=False,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
         **common_kwargs
     ):
         if "hidden_size" in common_kwargs:

@@ -196,6 +199,8 @@ class FSMTConfig(PretrainedConfig):
         self.activation_dropout = activation_dropout
         self.dropout = dropout

+        self.use_cache = use_cache
+
     @property
     def num_attention_heads(self) -> int:
         return self.encoder_attention_heads
...
@@ -1241,7 +1241,7 @@ class TFFunnelModel(TFFunnelPreTrainedModel):
 @add_start_docstrings(
     """
-    Funnel model with a binary classification head on top as used during pre-training for identifying generated tokens.
+    Funnel model with a binary classification head on top as used during pretraining for identifying generated tokens.
     """,
     FUNNEL_START_DOCSTRING,
 )
...
@@ -104,6 +104,8 @@ class GPT2Config(PretrainedConfig):
             The dropout ratio to be used after the projection and activation.
         gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).

     Example::

@@ -142,9 +144,10 @@ class GPT2Config(PretrainedConfig):
         summary_activation=None,
         summary_proj_to_labels=True,
         summary_first_dropout=0.1,
+        gradient_checkpointing=False,
+        use_cache=True,
         bos_token_id=50256,
         eos_token_id=50256,
-        gradient_checkpointing=False,
         **kwargs
     ):
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

@@ -168,6 +171,7 @@ class GPT2Config(PretrainedConfig):
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
         self.gradient_checkpointing = gradient_checkpointing
+        self.use_cache = use_cache
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
...
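
A sketch of what the flag controls at run time, using a tiny, randomly initialized GPT-2 so no checkpoint is needed (all sizes are illustrative):

```python
# Sketch: with `use_cache` enabled the forward pass returns past key/value states
# that can be reused for incremental decoding; with it disabled, nothing is cached.
import torch
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size=100, n_positions=64, n_embd=32, n_layer=2, n_head=2, use_cache=True)
model = GPT2LMHeadModel(config)
input_ids = torch.tensor([[1, 2, 3, 4]])

with torch.no_grad():
    cached = model(input_ids, use_cache=True, return_dict=True)
    uncached = model(input_ids, use_cache=False, return_dict=True)

print(cached.past_key_values is not None)   # expected: True
print(uncached.past_key_values is None)     # expected: True
```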
@@ -1013,7 +1013,7 @@ class LxmertModel(LxmertPreTrainedModel):
 @add_start_docstrings(
-    """Lxmert Model with a specified pre-training head on top. """,
+    """Lxmert Model with a specified pretraining head on top. """,
     LXMERT_START_DOCSTRING,
 )
 class LxmertForPreTraining(LxmertPreTrainedModel):

@@ -1024,7 +1024,7 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
         self.num_qa_labels = config.num_qa_labels
         self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
         self.task_mask_lm = config.task_mask_lm
         self.task_obj_predict = config.task_obj_predict
         self.task_matched = config.task_matched
...
@@ -1176,7 +1176,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
         self.num_qa_labels = config.num_qa_labels
         self.visual_loss_normalizer = config.visual_loss_normalizer

-        # Use of pre-training tasks
+        # Use of pretraining tasks
         self.task_mask_lm = config.task_mask_lm
         self.task_obj_predict = config.task_obj_predict
         self.task_matched = config.task_matched
...
@@ -933,7 +933,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
 @add_start_docstrings(
     """
-    MobileBert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
+    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
     `next sentence prediction (classification)` head.
     """,
     MOBILEBERT_START_DOCSTRING,
...
@@ -1014,7 +1014,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
 @add_start_docstrings(
     """
-    MobileBert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a
+    MobileBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
     `next sentence prediction (classification)` head.
     """,
     MOBILEBERT_START_DOCSTRING,
...
@@ -96,6 +96,9 @@ class OpenAIGPTConfig(PretrainedConfig):
             :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
             The dropout ratio to be used after the projection and activation.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
     Examples::

@@ -133,6 +136,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         summary_activation=None,
         summary_proj_to_labels=True,
         summary_first_dropout=0.1,
+        use_cache=True,
         **kwargs
     ):
         super().__init__(**kwargs)

@@ -155,6 +159,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
+        self.use_cache = use_cache

     @property
     def max_position_embeddings(self):
...
@@ -90,6 +90,8 @@ class ProphetNetConfig(PretrainedConfig):
         eps (:obj:`float`, `optional`, defaults to 0.0):
             Controls the ``epsilon`` parameter value for label smoothing in the loss calculation. If set to 0, no label
             smoothing is performed.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "prophetnet"
     keys_to_ignore_at_inference = ["past_key_values"]

@@ -112,15 +114,16 @@ class ProphetNetConfig(PretrainedConfig):
         init_std=0.02,
         is_encoder_decoder=True,
         add_cross_attention=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
         decoder_start_token_id=0,
         ngram=2,
         num_buckets=32,
         relative_max_distance=128,
         disable_ngram_loss=False,
         eps=0.0,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
         **kwargs
     ):
         super().__init__(

@@ -156,6 +159,8 @@ class ProphetNetConfig(PretrainedConfig):
         self.activation_dropout = activation_dropout
         self.dropout = dropout

+        self.use_cache = use_cache
+
     @property
     def num_attention_heads(self) -> int:
         return self.num_encoder_attention_heads
...
@@ -72,6 +72,8 @@ RAG_CONFIG_DOC = r"""
         output_retrieved(:obj:`bool`, `optional`, defaults to :obj:`False`):
             If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and
             :obj:`context_attention_mask` are returned. See returned tensors for more detail.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
 """

@@ -107,6 +109,7 @@ class RagConfig(PretrainedConfig):
         exclude_bos_score=False,
         do_marginalize=False,
         output_retrieved=False,
+        use_cache=True,
         **kwargs
     ):
         super().__init__(

@@ -156,6 +159,8 @@ class RagConfig(PretrainedConfig):
         self.do_deduplication = do_deduplication

+        self.use_cache = use_cache
+
     @classmethod
     def from_question_encoder_generator_configs(
         cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
...
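
Since `RagConfig` wraps a question-encoder config and a generator config, the flag can also be passed through the composition helper visible in the last hunk. A hedged sketch with default-initialized sub-configs chosen only for illustration:

```python
# Sketch: extra kwargs such as `use_cache` are forwarded to the composite RAG config.
from transformers import BartConfig, DPRConfig, RagConfig

question_encoder_config = DPRConfig()   # illustrative sub-configs, no weights involved
generator_config = BartConfig()

rag_config = RagConfig.from_question_encoder_generator_configs(
    question_encoder_config,
    generator_config,
    use_cache=True,
)
print(rag_config.use_cache)  # True
```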
@@ -138,6 +138,8 @@ class ReformerConfig(PretrainedConfig):
             :obj:`inputs_ids` passed when calling :class:`~transformers.ReformerModel`.
         tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to tie input and output embeddings.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).

     Examples::

@@ -188,6 +190,7 @@ class ReformerConfig(PretrainedConfig):
         pad_token_id=0,
         vocab_size=320,
         tie_word_embeddings=False,
+        use_cache=True,
         **kwargs
     ):
         super().__init__(

@@ -226,3 +229,4 @@ class ReformerConfig(PretrainedConfig):
         self.axial_norm_std = axial_norm_std
         self.chunk_size_lm_head = chunk_size_lm_head
         self.attn_layers = attn_layers
+        self.use_cache = use_cache
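
Reformer is one of the models behind the docstring's "not used by all models" caveat: its cache holds, roughly, bucket/hidden-state pairs rather than conventional key/value tensors. A config-only sketch, no weights involved:

```python
# Sketch: the flag lives on the config like any other default.
from transformers import ReformerConfig

config = ReformerConfig()          # use_cache defaults to True
print(config.use_cache)            # True
print(config.attn_layers)          # default mix of local and LSH attention layers

config = ReformerConfig(use_cache=False)
print(config.use_cache)            # False
```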