Unverified Commit dd52804f authored by Sylvain Gugger, committed by GitHub

Remove deprecated (#8604)



* Remove old deprecated arguments
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>

* Remove needless imports

* Fix tests
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
parent 3095ee9d
......@@ -595,7 +595,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
output_attentions=None,
output_hidden_states=None,
training=False,
**kwargs,
) -> Tuple:
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -621,21 +620,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 10, "Too many inputs."
if "past_key_value_states" in inputs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = inputs.pop("past_key_value_states")
else:
input_ids = inputs
if "past_key_value_states" in kwargs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = kwargs.pop("past_key_value_states")
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
......@@ -1078,23 +1064,9 @@ class TFT5Model(TFT5PreTrainedModel):
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 13, "Too many inputs."
if "past_key_value_states" in inputs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = inputs.pop("past_key_value_states")
else:
input_ids = inputs
if "past_key_value_states" in kwargs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = kwargs.pop("past_key_value_states")
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......@@ -1294,23 +1266,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 14, "Too many inputs."
if "past_key_value_states" in inputs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = inputs.pop("past_key_value_states")
else:
input_ids = inputs
if "past_key_value_states" in kwargs:
warnings.warn(
"The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
FutureWarning,
)
past_key_values = kwargs.pop("past_key_value_states")
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
......
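
Migration note: the T5 hunks above all make the same rename final; the cache keyword is `past_key_values`. A minimal sketch of the post-commit call pattern (the checkpoint, prompt, and `return_dict=True` are illustrative assumptions, not part of this diff):

```python
import tensorflow as tf
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

inputs = tokenizer("translate English to German: Hello", return_tensors="tf")
# T5 starts decoding from the configured decoder start token.
decoder_input_ids = tf.fill((1, 1), model.config.decoder_start_token_id)

# The first pass builds the cache and returns it as `past_key_values`.
outputs = model(
    inputs["input_ids"],
    decoder_input_ids=decoder_input_ids,
    use_cache=True,
    return_dict=True,
)

# Later passes feed the cache back under the same name; the removed
# `past_key_value_states` spelling now fails instead of warning.
next_step = model(
    inputs["input_ids"],
    decoder_input_ids=decoder_input_ids,
    past_key_values=outputs.past_key_values,
    return_dict=True,
)
```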
......@@ -15,9 +15,6 @@
# limitations under the License.
""" Transformer XL configuration """
import warnings
from ...configuration_utils import PretrainedConfig
from ...utils import logging
......@@ -139,13 +136,6 @@ class TransfoXLConfig(PretrainedConfig):
eos_token_id=0,
**kwargs
):
if "tie_weight" in kwargs:
warnings.warn(
"The config parameter `tie_weight` is deprecated. Please use `tie_word_embeddings` instead.",
FutureWarning,
)
kwargs["tie_word_embeddings"] = kwargs["tie_weight"]
super().__init__(eos_token_id=eos_token_id, **kwargs)
self.vocab_size = vocab_size
self.cutoffs = []
......
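
Migration note: `TransfoXLConfig` now only understands the standard `PretrainedConfig` name. A one-line sketch:

```python
from transformers import TransfoXLConfig

# `tie_weight=True` used to be remapped with a FutureWarning;
# pass `tie_word_embeddings` directly now.
config = TransfoXLConfig(tie_word_embeddings=True)
```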
......@@ -16,7 +16,6 @@
"""
TF 2.0 Transformer XL model.
"""
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple
......@@ -865,13 +864,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.crit.out_layers[-1]
return None
def reset_length(self, tgt_len, ext_len, mem_len):
warnings.warn(
"The method `reset_length` is deprecated and will be removed in a future version, use `reset_memory_length` instead.",
FutureWarning,
)
self.transformer.reset_memory_length(mem_len)
def reset_memory_length(self, mem_len):
self.transformer.reset_memory_length(mem_len)
......
......@@ -17,7 +17,6 @@
PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
"""
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple
......@@ -1010,13 +1009,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
else:
self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
def reset_length(self, tgt_len, ext_len, mem_len):
warnings.warn(
"The method `reset_length` is deprecated and will be removed in a future version, use `reset_memory_length` instead.",
FutureWarning,
)
self.transformer.reset_memory_length(mem_len)
def reset_memory_length(self, mem_len):
self.transformer.reset_memory_length(mem_len)
......
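
Migration note: in both the TF and PyTorch Transformer-XL heads, the removed `reset_length` shim only ever forwarded its `mem_len` argument, so nothing is lost in the rename. A short sketch (the checkpoint and the value 600 are illustrative):

```python
from transformers import TransfoXLLMHeadModel

model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

# Previously: model.reset_length(tgt_len, ext_len, mem_len)
model.reset_memory_length(600)
```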
......@@ -16,9 +16,7 @@
TF 2.0 XLM model.
"""
import itertools
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
......@@ -997,10 +995,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
)
if lengths is not None:
warnings.warn(
logger.warning(
"The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
"attention mask instead.",
FutureWarning,
)
lengths = None
......
......@@ -16,10 +16,8 @@
PyTorch XLM model.
"""
import itertools
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
......@@ -1228,10 +1226,9 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
)
if lengths is not None:
warnings.warn(
logger.warning(
"The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
"attention mask instead.",
FutureWarning,
"attention mask instead."
)
lengths = None
......
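
Migration note: for the XLM multiple choice heads, `lengths` was never usable; variable-length inputs go through the attention mask, as the retained log message says. A minimal sketch with the PyTorch model (checkpoint and texts are illustrative):

```python
from transformers import XLMForMultipleChoice, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

prompt = "The sky is"
choices = ["blue.", "a potato."]
encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)

# Inputs are (batch_size, num_choices, seq_len); padding is conveyed by the
# attention mask rather than a `lengths` tensor.
outputs = model(
    input_ids=encoding["input_ids"].unsqueeze(0),
    attention_mask=encoding["attention_mask"].unsqueeze(0),
)
```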
......@@ -1182,7 +1182,6 @@ class FillMaskPipeline(Pipeline):
device: int = -1,
top_k=5,
task: str = "",
**kwargs
):
super().__init__(
model=model,
......@@ -1196,14 +1195,6 @@ class FillMaskPipeline(Pipeline):
)
self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
if "topk" in kwargs:
warnings.warn(
"The `topk` argument is deprecated and will be removed in a future version, use `top_k` instead.",
FutureWarning,
)
self.top_k = kwargs.pop("topk")
else:
self.top_k = top_k
def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
......
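
Migration note: `FillMaskPipeline` now accepts only the `top_k` spelling, and only at construction time. A sketch (the model name is illustrative):

```python
from transformers import pipeline

# Previously `topk=3` was remapped with a FutureWarning; now it is rejected.
fill_mask = pipeline("fill-mask", model="distilroberta-base", top_k=3)
print(fill_mask("The capital of France is <mask>."))
```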
......@@ -19,7 +19,6 @@
import itertools
import re
import unicodedata
import warnings
from typing import Any, Dict, List, Optional, Tuple, Union, overload
from .file_utils import add_end_docstrings
......@@ -246,12 +245,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
Returns:
:obj:`List[str]`: The list of tokens.
"""
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
all_special_tokens_extended = dict(
(str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
......@@ -448,13 +441,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
"https://github.com/huggingface/transformers/pull/2674"
)
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None
......@@ -530,13 +516,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
"transformers.PreTrainedTokenizerFast."
)
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs:
if not isinstance(ids_or_pair_ids, (list, tuple)):
......
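
Migration note: all three removed blocks in `PreTrainedTokenizer` covered the same alias, so the user-facing change is a single rename. A sketch with an arbitrary slow tokenizer (the checkpoint is illustrative):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# `is_pretokenized=True` is gone; flag pre-split input like this instead.
encoding = tokenizer(["Hello", "world", "!"], is_split_into_words=True)
```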
......@@ -1532,18 +1532,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
super().__init__(**kwargs)
@property
def max_len(self) -> int:
"""
:obj:`int`: **Deprecated** Kept here for backward compatibility. Now renamed to :obj:`model_max_length` to
avoid ambiguity.
"""
warnings.warn(
"The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.",
FutureWarning,
)
return self.model_max_length
@property
def max_len_single_sentence(self) -> int:
"""
......@@ -2785,15 +2773,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
and ``convert_tokens_to_ids`` methods.
"""
if "return_lengths" in kwargs:
if verbose:
warnings.warn(
"The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
"Please use `return_length` instead.",
FutureWarning,
)
return_length = kwargs["return_lengths"]
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
......
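
Migration note: two renames land in `PreTrainedTokenizerBase`, the `max_len` property and the `return_lengths` flag of `prepare_for_model`. A sketch of the surviving names (checkpoint illustrative):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# `tokenizer.max_len` is removed; `model_max_length` is the canonical name.
print(tokenizer.model_max_length)  # 512 for this checkpoint

# `return_lengths=True` becomes `return_length=True`.
encoding = tokenizer("Hello world", return_length=True)
print(encoding["length"])
```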
......@@ -19,7 +19,6 @@
import json
import os
import warnings
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union
......@@ -357,7 +356,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
if not isinstance(batch_text_or_text_pairs, list):
......@@ -365,16 +363,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
"batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
)
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
if kwargs:
raise ValueError(f"Keyword arguments {kwargs} not recognized.")
# Set the truncation and padding strategy and restore the initial configuration
self.set_truncation_and_padding(
padding_strategy=padding_strategy,
......@@ -453,12 +441,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
verbose: bool = True,
**kwargs
) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
batched_input = [(text, text_pair)] if text_pair else [text]
batched_output = self._batch_encode_plus(
......
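
Migration note: with `**kwargs` dropped from the fast `_batch_encode_plus`, unknown keywords no longer reach a warning path at all. A hedged sketch of the new failure mode (the exact exception type depends on the call path, so both are caught here):

```python
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
words = ["Hello", "world"]

# Supported spelling:
tokenizer(words, is_split_into_words=True)

# The removed alias now errors out instead of emitting a FutureWarning.
try:
    tokenizer(words, is_pretokenized=True)
except (TypeError, ValueError) as err:
    print(type(err).__name__, err)
```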
......@@ -213,8 +213,6 @@ class Trainer:
containing the optimizer and the scheduler to use. Will default to an instance of
:class:`~transformers.AdamW` on your model and a scheduler given by
:func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
kwargs:
Deprecated keyword arguments.
"""
def __init__(
......@@ -229,7 +227,6 @@ class Trainer:
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
callbacks: Optional[List[TrainerCallback]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
**kwargs,
):
if args is None:
logger.info("No `TrainingArguments` passed, using the current path as `output_dir`.")
......@@ -262,27 +259,6 @@ class Trainer:
self.callback_handler = CallbackHandler(callbacks, self.model, self.optimizer, self.lr_scheduler)
self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
# Deprecated arguments
if "tb_writer" in kwargs:
warnings.warn(
"Passing `tb_writer` as a keyword argument is deprecated and won't be possible in a "
+ "future version. Use `TensorBoardCallback(tb_writer=...)` instead and pass it to the `callbacks`"
+ " argument",
FutureWarning,
)
tb_writer = kwargs.pop("tb_writer")
self.remove_callback(TensorBoardCallback)
self.add_callback(TensorBoardCallback(tb_writer=tb_writer))
if "prediction_loss_only" in kwargs:
warnings.warn(
"Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a "
+ "future version. Use `args.prediction_loss_only` instead. Setting "
+ f"`args.prediction_loss_only={kwargs['prediction_loss_only']}`.",
FutureWarning,
)
self.args.prediction_loss_only = kwargs.pop("prediction_loss_only")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
# Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
self._loggers_initialized = False
......@@ -294,14 +270,7 @@ class Trainer:
# We'll find a more elegant solution and not need to do this in the future.
self.model.config.xla_device = True
if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
self.data_collator = self.data_collator.collate_batch
warnings.warn(
(
"The `data_collator` should now be a simple callable (function, class with `__call__`), classes "
+ "with a `collate_batch` are deprecated and won't be supported in a future version."
),
FutureWarning,
)
raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).")
if args.max_steps > 0:
logger.info("max_steps is given, it will override any value given in num_train_epochs")
......@@ -1050,12 +1019,6 @@ class Trainer:
logs (:obj:`Dict[str, float]`):
The values to log.
"""
if hasattr(self, "_log"):
warnings.warn(
"The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
FutureWarning,
)
return self._log(logs)
if self.state.epoch is not None:
logs["epoch"] = self.state.epoch
......@@ -1095,12 +1058,6 @@ class Trainer:
Return:
:obj:`torch.Tensor`: The tensor with training loss on this batch.
"""
if hasattr(self, "_training_step"):
warnings.warn(
"The `_training_step` method is deprecated and won't be called in a future version, define `training_step` in your subclass.",
FutureWarning,
)
return self._training_step(model, inputs, self.optimizer)
model.train()
inputs = self._prepare_inputs(inputs)
......@@ -1140,18 +1097,6 @@ class Trainer:
# We don't use .loss here since the model may return tuples instead of ModelOutput.
return outputs[0]
def is_local_master(self) -> bool:
"""
Whether or not this process is the local main process (e.g., on one machine, if training in a distributed
fashion on several machines).
.. warning::
This method is deprecated, use :meth:`~transformers.Trainer.is_local_process_zero` instead.
"""
warnings.warn("This method is deprecated, use `Trainer.is_local_process_zero()` instead.", FutureWarning)
return self.is_local_process_zero()
def is_local_process_zero(self) -> bool:
"""
Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
......@@ -1162,18 +1107,6 @@ class Trainer:
else:
return self.args.local_rank in [-1, 0]
def is_world_master(self) -> bool:
"""
Whether or not this process is the global main process (when training in a distributed fashion on several
machines, this is only going to be :obj:`True` for one process).
.. warning::
This method is deprecated, use :meth:`~transformers.Trainer.is_world_process_zero` instead.
"""
warnings.warn("This method is deprecated, use `Trainer.is_world_process_zero()` instead.", FutureWarning)
return self.is_world_process_zero()
def is_world_process_zero(self) -> bool:
"""
Whether or not this process is the global main process (when training in a distributed fashion on several
......@@ -1362,13 +1295,6 @@ class Trainer:
Works both with or without labels.
"""
if hasattr(self, "_prediction_loop"):
warnings.warn(
"The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
FutureWarning,
)
return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only)
if not isinstance(dataloader.dataset, collections.abc.Sized):
raise ValueError("dataset must implement __len__")
prediction_loss_only = (
......
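
Migration note: the `Trainer` constructor no longer absorbs `tb_writer` or `prediction_loss_only`; both have first-class homes, as the removed warnings indicated. A construction sketch (model and output directory are illustrative, and no training is run):

```python
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# `prediction_loss_only` moves onto TrainingArguments...
args = TrainingArguments(output_dir="out", prediction_loss_only=True)

# ...and a custom SummaryWriter goes through TensorBoardCallback(tb_writer=...),
# passed in the `callbacks` list rather than to Trainer itself.
trainer = Trainer(model=model, args=args, callbacks=[TensorBoardCallback()])
```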
......@@ -3,7 +3,6 @@
import datetime
import math
import os
import warnings
from typing import Callable, Dict, Optional, Tuple
......@@ -66,8 +65,6 @@ class TFTrainer:
:class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
:class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else an
instance of :class:`~transformers.WarmUp`.
kwargs:
Deprecated keyword arguments.
"""
def __init__(
......@@ -82,7 +79,6 @@ class TFTrainer:
None,
None,
),
**kwargs,
):
assert parse(tf.__version__).release >= (2, 2, 0), (
"You need to run the TensorFlow trainer with at least version 2.2.0; your version is %r "
......@@ -98,13 +94,6 @@ class TFTrainer:
self.gradient_accumulator = GradientAccumulator()
self.global_step = 0
self.epoch_logging = 0
if "prediction_loss_only" in kwargs:
warnings.warn(
"Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a future version. Use `args.prediction_loss_only` instead.",
FutureWarning,
)
self.args.prediction_loss_only = kwargs.pop("prediction_loss_only")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
if tb_writer is not None:
self.tb_writer = tb_writer
......@@ -249,12 +238,6 @@ class TFTrainer:
WANDB_DISABLED:
(Optional): boolean - defaults to false, set to "true" to disable wandb entirely.
"""
if hasattr(self, "_setup_wandb"):
warnings.warn(
"The `_setup_wandb` method is deprecated and won't be called in a future version, define `setup_wandb` in your subclass.",
FutureWarning,
)
return self._setup_wandb()
logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"')
combined_dict = {**self.model.config.to_dict(), **self.args.to_sanitized_dict()}
......@@ -304,14 +287,6 @@ class TFTrainer:
Works both with or without labels.
"""
if hasattr(self, "_prediction_loop"):
warnings.warn(
"The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
FutureWarning,
)
return self._prediction_loop(
dataset, steps, num_examples, description, prediction_loss_only=prediction_loss_only
)
prediction_loss_only = (
prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
......@@ -393,12 +368,6 @@ class TFTrainer:
logs (:obj:`Dict[str, float]`):
The values to log.
"""
if hasattr(self, "_log"):
warnings.warn(
"The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
FutureWarning,
)
return self._log(logs)
logs["epoch"] = self.epoch_logging
if self.tb_writer:
......@@ -733,12 +702,6 @@ class TFTrainer:
Returns:
A tuple of two :obj:`tf.Tensor`: The loss and logits.
"""
if hasattr(self, "_run_model"):
warnings.warn(
"The `_run_model` method is deprecated and won't be called in a future version, define `run_model` in your subclass.",
FutureWarning,
)
return self._run_model(features, labels, training)
if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
features["mems"] = self._past
......
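
Migration note: `TFTrainer` loses the same escape hatches; underscore-prefixed subclass hooks (`_log`, `_setup_wandb`, `_prediction_loop`, `_run_model`) are no longer detected, and `prediction_loss_only` belongs on the arguments object. A minimal subclassing sketch under those assumptions:

```python
from transformers import TFTrainer

class LoggingTFTrainer(TFTrainer):
    # Override the public method; a `_log` definition would now be ignored.
    def log(self, logs):
        print("step logs:", logs)
        super().log(logs)
```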
import dataclasses
import json
import os
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
......@@ -198,10 +197,6 @@ class TrainingArguments:
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
evaluate_during_training: bool = field(
default=False,
metadata={"help": "Run evaluation during training at each logging step."},
)
evaluation_strategy: EvaluationStrategy = field(
default="no",
metadata={"help": "The evaluation strategy to use."},
......@@ -340,12 +335,6 @@ class TrainingArguments:
def __post_init__(self):
if self.disable_tqdm is None:
self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
if self.evaluate_during_training is True:
self.evaluation_strategy = EvaluationStrategy.STEPS
warnings.warn(
"The `evaluate_during_training` argument is deprecated in favor of `evaluation_strategy` (which has more options)",
FutureWarning,
)
self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
self.do_eval = True
......
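
Migration note: `evaluate_during_training=True` used to be coerced to the "steps" strategy, so that is the drop-in replacement; the enum also offers "no" and "epoch". A sketch:

```python
from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", evaluation_strategy="steps")
print(args.evaluation_strategy)  # EvaluationStrategy.STEPS
```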
......@@ -73,7 +73,6 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
{%- elif cookiecutter.tokenizer_type == "Standalone" %}
import warnings
from typing import List, Optional
from tokenizers import ByteLevelBPETokenizer
......@@ -234,13 +233,6 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
......@@ -285,29 +277,6 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = None
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
is_split_into_words = None
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words
return super()._encode_plus(*args, **kwargs)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None:
......
......@@ -213,7 +213,9 @@ class GPT2ModelTester:
next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
"last_hidden_state"
]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
......@@ -255,7 +257,7 @@ class GPT2ModelTester:
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
......@@ -286,7 +288,9 @@ class GPT2ModelTester:
next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
"last_hidden_state"
]
self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
# select random slice
......
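
Migration note: the test updates mirror the public GPT-2 rename from `past` to `past_key_values`. A sketch of the incremental-decoding pattern the tests exercise (checkpoint and prompt are illustrative):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

input_ids = tokenizer("Hello, my dog", return_tensors="pt").input_ids
out = model(input_ids, use_cache=True, return_dict=True)

# The cache goes back in as `past_key_values`, not `past`.
next_out = model(input_ids[:, -1:], past_key_values=out.past_key_values, return_dict=True)
```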
import unittest
import pytest
from transformers import pipeline
from transformers.testing_utils import require_tf, require_torch, slow
......@@ -53,13 +51,6 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
]
expected_check_keys = ["sequence"]
@require_torch
def test_torch_topk_deprecation(self):
# `topk` was only accepted at pipeline initialization; it was never
# enabled at the pipeline call site.
with pytest.warns(FutureWarning, match=r".*use `top_k`.*"):
pipeline(task="fill-mask", model=self.small_models[0], topk=1)
@require_torch
def test_torch_fill_mask(self):
valid_inputs = "My name is <mask>"
......
......@@ -83,7 +83,7 @@ class AutoTokenizerTest(unittest.TestCase):
else:
self.assertEqual(tokenizer.do_lower_case, False)
self.assertEqual(tokenizer.max_len, 512)
self.assertEqual(tokenizer.model_max_length, 512)
@require_tokenizers
def test_tokenizer_identifier_non_existent(self):
......