"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "ca3fc36de355985040ebcf992ef3d5dc9ebd5e41"
Unverified Commit dd52804f authored by Sylvain Gugger, committed by GitHub

Remove deprecated (#8604)



* Remove old deprecated arguments
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>

* Remove needless imports

* Fix tests
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
parent 3095ee9d
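
The hunks below only delete backward-compatibility shims; the replacement names have been the documented API for a while (`labels` instead of `masked_lm_labels`/`lm_labels`, `past_key_values` instead of `past`/`decoder_cached_states`/`decoder_past_key_values`, `is_split_into_words` instead of `is_pretokenized`, `evaluation_strategy` instead of `evaluate_during_training`, `model_max_length` instead of `max_len`, `is_world_process_zero()` instead of `is_world_master()`). A rough, hedged sketch of the caller-side renames — the checkpoint names are placeholders, not part of this commit:

```python
# Illustrative sketch of two of the renames whose fallbacks are removed in this commit.
from transformers import BertForMaskedLM, BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# old: model(**inputs, masked_lm_labels=inputs["input_ids"])
outputs = model(**inputs, labels=inputs["input_ids"])

# old: tokenizer(["Hello", "world"], is_pretokenized=True)
tokenizer(["Hello", "world"], is_split_into_words=True)
```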
@@ -138,7 +138,7 @@ class TestFinetuneTrainer(TestCasePlus):
             per_device_train_batch_size=batch_size,
             per_device_eval_batch_size=batch_size,
             predict_with_generate=True,
-            evaluate_during_training=True,
+            evaluation_strategy="steps",
             do_train=True,
             do_eval=True,
             warmup_steps=0,
@@ -179,7 +179,7 @@ class TestFinetuneTrainer(TestCasePlus):
             --per_device_eval_batch_size 4
             --learning_rate 3e-3
             --warmup_steps 8
-            --evaluate_during_training
+            --evaluation_strategy steps
             --predict_with_generate
             --logging_steps 0
             --save_steps {str(eval_steps)}
...
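For context on the test change above: the boolean `evaluate_during_training=True` maps onto the string-valued `evaluation_strategy` field of the training arguments. A minimal sketch, with a placeholder `output_dir` not taken from the diff:

```python
from transformers import TrainingArguments

# old: TrainingArguments(output_dir="out", evaluate_during_training=True)
args = TrainingArguments(output_dir="out", evaluation_strategy="steps", eval_steps=500)
```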
@@ -254,7 +254,7 @@ def main():
         trainer.save_model()
         # For convenience, we also re-save the tokenizer to the same directory,
         # so that you can share your model easily on huggingface.co/models =)
-        if trainer.is_world_master():
+        if trainer.is_world_process_zero():
             tokenizer.save_pretrained(training_args.output_dir)
     # Evaluation
@@ -265,7 +265,7 @@ def main():
         result = trainer.evaluate()
         output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-        if trainer.is_world_master():
+        if trainer.is_world_process_zero():
             with open(output_eval_file, "w") as writer:
                 logger.info("***** Eval results *****")
                 for key, value in result.items():
...
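The `is_world_master()` call sites above move to `is_world_process_zero()`, which answers the same question (is this the main process?) under the new name. A hedged sketch of the guard as it would look in such a script, assuming `trainer`, `tokenizer`, and `training_args` are already built:

```python
# Only the main process in a distributed run should write artifacts to disk.
if trainer.is_world_process_zero():  # previously: trainer.is_world_master()
    tokenizer.save_pretrained(training_args.output_dir)
```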
@@ -145,11 +145,11 @@ def squad_convert_example_to_features(
     # in the way they compute mask of added tokens.
     tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
     sequence_added_tokens = (
-        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
+        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
         if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
-        else tokenizer.max_len - tokenizer.max_len_single_sentence
+        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
     )
-    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
+    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
     span_doc_tokens = all_doc_tokens
     while len(spans) * doc_stride < len(all_doc_tokens):
...
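`tokenizer.max_len` is gone in favour of `tokenizer.model_max_length`, which the SQuAD feature conversion above now reads. A quick sketch of the related attributes (checkpoint name is just an example):

```python
from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
print(tok.model_max_length)         # replaces tok.max_len; 512 for this checkpoint
print(tok.max_len_single_sentence)  # model_max_length minus special tokens added to one sequence
print(tok.max_len_sentences_pair)   # model_max_length minus special tokens added to a pair
```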
@@ -16,7 +16,6 @@
 import math
 import os
-import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -742,7 +741,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
         r"""
         labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
@@ -753,8 +751,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
             (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence
             A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A).
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         Returns:
@@ -773,14 +769,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
             >>> sop_logits = outputs.sop_logits
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.albert(
@@ -898,23 +886,13 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
             (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.albert(
...
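With the `**kwargs` escape hatch removed from the ALBERT heads above, the old `masked_lm_labels` keyword is no longer silently remapped; it fails like any unknown argument. A minimal sketch, assuming the `albert-base-v2` checkpoint:

```python
from transformers import AlbertForMaskedLM, AlbertTokenizer

tok = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForMaskedLM.from_pretrained("albert-base-v2")
batch = tok("Paris is the capital of [MASK].", return_tensors="pt")

outputs = model(**batch, labels=batch["input_ids"])      # supported
# model(**batch, masked_lm_labels=batch["input_ids"])    # now: TypeError (unexpected keyword argument)
```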
@@ -15,7 +15,6 @@
 """PyTorch BART model, ported from the fairseq repo."""
 import math
 import random
-import warnings
 from typing import Dict, List, Optional, Tuple
 import numpy as np
@@ -529,7 +528,6 @@ class BartDecoder(nn.Module):
         output_attentions=False,
         output_hidden_states=False,
         return_dict=True,
-        **unused,
     ):
         """
         Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
@@ -551,18 +549,6 @@ class BartDecoder(nn.Module):
             - hidden states
             - attentions
         """
-        if "decoder_cached_states" in unused:
-            warnings.warn(
-                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_cached_states")
-        if "decoder_past_key_values" in unused:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_past_key_values")
         # check attention mask and invert
         if encoder_padding_mask is not None:
@@ -873,14 +859,7 @@ class BartModel(PretrainedBartModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
-        if "decoder_past_key_values" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_values")
         if decoder_input_ids is None:
             use_cache = False
@@ -1006,7 +985,6 @@ class BartForConditionalGeneration(PretrainedBartModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **unused,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1034,24 +1012,6 @@ class BartForConditionalGeneration(PretrainedBartModel):
             >>> tokenizer.decode(predictions).split()
             >>> # ['good', 'great', 'all', 'really', 'very']
         """
-        if "lm_labels" in unused:
-            warnings.warn(
-                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = unused.pop("lm_labels")
-        if "decoder_cached_states" in unused:
-            warnings.warn(
-                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_cached_states")
-        if "decoder_past_key_values" in unused:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_past_key_values")
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if labels is not None:
...
@@ -896,7 +896,6 @@ class BertForPreTraining(BertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
@@ -928,13 +927,6 @@ class BertForPreTraining(BertPreTrainedModel):
             >>> prediction_logits = outputs.prediction_logits
             >>> seq_relationship_logits = outputs.seq_relationship_logits
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.bert(
@@ -1136,24 +1128,13 @@ class BertForMaskedLM(BertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
             (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...
@@ -15,9 +15,6 @@
 # limitations under the License.
 """ PyTorch CTRL model."""
-import warnings
 import numpy as np
 import torch
 import torch.nn as nn
@@ -369,15 +366,7 @@ class CTRLModel(CTRLPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
-        if "past" in kwargs:
-            warnings.warn(
-                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("past")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -542,7 +531,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -550,13 +538,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
             ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
             ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
         """
-        if "past" in kwargs:
-            warnings.warn(
-                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("past")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         transformer_outputs = self.transformer(
...
@@ -20,7 +20,6 @@
 import copy
 import math
-import warnings
 import numpy as np
 import torch
@@ -526,23 +525,13 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
             (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         dlbrt_output = self.distilbert(
...
@@ -16,7 +16,6 @@
 import math
 import os
-import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -1000,23 +999,13 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
             (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         generator_hidden_states = self.electra(
...
@@ -29,7 +29,6 @@
 import math
 import random
-import warnings
 from typing import Any, Dict, List, Optional, Tuple
 import torch
@@ -618,7 +617,6 @@ class FSMTDecoder(nn.Module):
         output_attentions=False,
         output_hidden_states=False,
         return_dict=True,
-        **unused,
     ):
         """
         Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
@@ -640,19 +638,6 @@ class FSMTDecoder(nn.Module):
             - hidden states
             - attentions
         """
-        if "decoder_cached_states" in unused:
-            warnings.warn(
-                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_cached_states")
-        if "decoder_past_key_values" in unused:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = unused.pop("decoder_past_key_values")
         # check attention mask and invert
         if encoder_padding_mask is not None:
             encoder_padding_mask = invert_mask(encoder_padding_mask)
@@ -933,15 +918,7 @@ class FSMTModel(PretrainedFSMTModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
-        if "decoder_past_key_values" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_values")
         if decoder_input_ids is None:
             use_cache = False
@@ -1071,7 +1048,6 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **unused,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
...
@@ -16,7 +16,6 @@
 """PyTorch OpenAI GPT-2 model."""
 import os
-import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
@@ -528,16 +527,7 @@ class GPT2Model(GPT2PreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
-        if "past" in kwargs:
-            warnings.warn(
-                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("past")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -758,7 +748,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -766,13 +755,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
             ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
         """
-        if "past" in kwargs:
-            warnings.warn(
-                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("past")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         transformer_outputs = self.transformer(
@@ -900,8 +882,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
             num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
             `input_ids` above)
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         Return:
@@ -930,19 +910,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             >>> mc_logits = outputs.mc_logits
         """
-        if "lm_labels" in kwargs:
-            warnings.warn(
-                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("lm_labels")
-        if "past" in kwargs:
-            warnings.warn(
-                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("past")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         transformer_outputs = self.transformer(
...
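For the GPT-2 changes above, the cached-state keyword is `past_key_values` on both input and output; the `past` alias is no longer accepted. A hedged incremental-decoding sketch with the `gpt2` checkpoint:

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

first = model(tok("Hello, my dog", return_tensors="pt").input_ids, use_cache=True)
# old: model(next_ids, past=first.past_key_values)
second = model(tok(" is", return_tensors="pt").input_ids, past_key_values=first.past_key_values)
```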
@@ -17,7 +17,6 @@
 import json
 import os
-import warnings
 from functools import lru_cache
 from typing import Optional, Tuple
@@ -293,13 +292,6 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         return vocab_file, merge_file
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        if "is_pretokenized" in kwargs:
-            warnings.warn(
-                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
-                FutureWarning,
-            )
-            is_split_into_words = kwargs.pop("is_pretokenized")
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
         if is_split_into_words or add_prefix_space:
             text = " " + text
...
@@ -16,7 +16,6 @@
 import json
-import warnings
 from typing import Optional, Tuple
 from tokenizers import pre_tokenizers
@@ -151,13 +150,6 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
         self.add_prefix_space = add_prefix_space
     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        if "is_pretokenized" in kwargs:
-            warnings.warn(
-                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
-                FutureWarning,
-            )
-            is_split_into_words = kwargs.pop("is_pretokenized")
         is_split_into_words = kwargs.get("is_split_into_words", False)
         assert self.add_prefix_space or not is_split_into_words, (
             f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
@@ -167,14 +159,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
         return super()._batch_encode_plus(*args, **kwargs)
     def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        if "is_pretokenized" in kwargs:
-            warnings.warn(
-                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
-                FutureWarning,
-            )
-            is_split_into_words = kwargs.pop("is_pretokenized")
-        else:
-            is_split_into_words = kwargs.get("is_split_into_words", False)
+        is_split_into_words = kwargs.get("is_split_into_words", False)
         assert self.add_prefix_space or not is_split_into_words, (
             f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
...
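The fast GPT-2 tokenizer above now only understands `is_split_into_words`; the `is_pretokenized` spelling is rejected. A sketch (note the `add_prefix_space=True` requirement that the assertion in the hunk enforces):

```python
from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
# old: tok(["Hello", "world"], is_pretokenized=True)
enc = tok(["Hello", "world"], is_split_into_words=True)
print(enc.tokens())
```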
@@ -15,7 +15,6 @@
 """PyTorch Longformer model. """
 import math
-import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -1509,7 +1508,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1538,14 +1536,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
             >>> loss = outputs.loss
             >>> prediction_logits = output.logits
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.longformer(
...
@@ -1109,7 +1109,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1119,12 +1118,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
         kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.mobilebert(
...
@@ -19,7 +19,6 @@
 import json
 import math
 import os
-import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -645,7 +644,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
@@ -659,8 +657,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
             num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
             `input_ids` above)
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         Return:
@@ -683,13 +679,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             >>> mc_logits = outputs.mc_logits
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if "lm_labels" in kwargs:
-            warnings.warn(
-                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         transformer_outputs = self.transformer(
             input_ids,
...
@@ -302,7 +302,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         **kwargs,
     ) -> BatchEncoding:
         if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
         model_inputs = self(
             src_texts,
             add_special_tokens=True,
...
@@ -16,7 +16,6 @@
 """PyTorch RoBERTa model. """
 import math
-import warnings
 import torch
 import torch.nn as nn
@@ -872,7 +871,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -882,13 +880,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
         kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
-        if "masked_lm_labels" in kwargs:
-            warnings.warn(
-                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("masked_lm_labels")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.roberta(
...
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization classes for RoBERTa."""
-import warnings
 from typing import List, Optional
 from ...tokenization_utils import AddedToken
@@ -251,13 +250,6 @@ class RobertaTokenizer(GPT2Tokenizer):
         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        if "is_pretokenized" in kwargs:
-            warnings.warn(
-                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
-                FutureWarning,
-            )
-            is_split_into_words = kwargs.pop("is_pretokenized")
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
         if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
             text = " " + text
...
@@ -18,7 +18,6 @@
 import copy
 import math
 import os
-import warnings
 import torch
 import torch.nn.functional as F
@@ -1048,7 +1047,6 @@ class T5Model(T5PreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
         r"""
         Returns:
@@ -1066,20 +1064,6 @@ class T5Model(T5PreTrainedModel):
             >>> last_hidden_states = outputs.last_hidden_state
         """
-        if "decoder_past_key_value_states" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_value_states")
-        if "decoder_past_key_values" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_values")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1198,15 +1182,12 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
-        **kwargs,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
             config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
             labels in ``[0, ..., config.vocab_size]``
-        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
         Returns:
@@ -1226,27 +1207,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
             >>> outputs = model.generate(input_ids)
         """
-        if "lm_labels" in kwargs:
-            warnings.warn(
-                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
-                FutureWarning,
-            )
-            labels = kwargs.pop("lm_labels")
-        if "decoder_past_key_value_states" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_value_states")
-        if "decoder_past_key_values" in kwargs:
-            warnings.warn(
-                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
-                FutureWarning,
-            )
-            past_key_values = kwargs.pop("decoder_past_key_values")
-        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...
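The T5 heads follow the same pattern: training targets go through `labels` (the `lm_labels` fallback is gone) and cached decoder states through `past_key_values`. A minimal training-loss sketch with the `t5-small` checkpoint:

```python
from transformers import T5ForConditionalGeneration, T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = tok("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
labels = tok("Das Haus ist wunderbar.", return_tensors="pt").input_ids
loss = model(input_ids=input_ids, labels=labels).loss  # old: lm_labels=... now raises TypeError
```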