Unverified Commit 378142af authored by Sylvain Gugger, committed by GitHub

Rename add_start_docstrings_to_callable (#8120)

parent 6241c873
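The diff below is a pure rename: every call site of add_start_docstrings_to_callable switches to add_start_docstrings_to_model_forward, and only the long DistilBert multiple-choice decorator line and the DPR import are additionally re-wrapped across lines. As a rough usage sketch only (the MyBertStyleModel class and MY_INPUTS_DOCSTRING constant are invented for illustration and assume a transformers version that includes this commit; they are not part of the diff), a decorated forward method looks like this after the rename:

from transformers.file_utils import add_start_docstrings_to_model_forward

# Hypothetical docstring template; real models use constants such as BERT_INPUTS_DOCSTRING.
MY_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
            Indices of input sequence tokens in the vocabulary.
"""

class MyBertStyleModel:
    # Before this commit the decorator was named add_start_docstrings_to_callable.
    @add_start_docstrings_to_model_forward(MY_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(self, input_ids=None):
        return input_ids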
@@ -20,7 +20,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_albert import (
     ALBERT_INPUTS_DOCSTRING,
     ALBERT_START_DOCSTRING,
@@ -87,7 +87,7 @@ class AlbertModelWithPabee(AlbertModel):
         message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
         print(message)
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -230,7 +230,7 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
...
@@ -22,7 +22,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import (
     BERT_INPUTS_DOCSTRING,
     BERT_START_DOCSTRING,
@@ -92,7 +92,7 @@ class BertModelWithPabee(BertModel):
         message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
         print(message)
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -254,7 +254,7 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
...
@@ -2,7 +2,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import (
     BERT_INPUTS_DOCSTRING,
     BERT_START_DOCSTRING,
@@ -134,7 +134,7 @@ class DeeBertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -288,7 +288,7 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
...
@@ -4,7 +4,7 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from transformers.configuration_roberta import RobertaConfig
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_roberta import ROBERTA_INPUTS_DOCSTRING, ROBERTA_START_DOCSTRING, RobertaEmbeddings
 from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayException, entropy
@@ -45,7 +45,7 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
...
@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from emmental import MaskedBertConfig
 from emmental.modules import MaskedLinear
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import ACT2FN, BertLayerNorm, load_tf_weights_in_bert
 from transformers.modeling_utils import PreTrainedModel, prune_linear_layer
@@ -498,7 +498,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -671,7 +671,7 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -756,7 +756,7 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -846,7 +846,7 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -932,7 +932,7 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
...
@@ -425,7 +425,7 @@ def add_start_docstrings(*docstr):
     return docstring_decorator
-def add_start_docstrings_to_callable(*docstr):
+def add_start_docstrings_to_model_forward(*docstr):
     def docstring_decorator(fn):
         class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
         intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
...
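The hunk above shows only the renamed signature and the first two lines of the decorator body. For orientation, here is a minimal sketch of how a docstring-prepending decorator of this shape typically works; the class_name and intro lines mirror the hunk, while everything after them (how *docstr is attached to fn.__doc__) is an assumed detail for illustration and is not shown in this diff:

def add_start_docstrings_to_model_forward(*docstr):
    def docstring_decorator(fn):
        class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
        intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
        # Assumed: prepend the intro and the supplied docstring fragments to fn's own docstring.
        fn.__doc__ = intro + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
        return fn

    return docstring_decorator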
@@ -30,7 +30,7 @@ from .file_utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -631,7 +631,7 @@ class AlbertModel(AlbertPreTrainedModel):
             inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
             self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -727,7 +727,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
     def get_input_embeddings(self):
         return self.albert.embeddings.word_embeddings
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -879,7 +879,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
     def get_input_embeddings(self):
         return self.albert.embeddings.word_embeddings
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -967,7 +967,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1055,7 +1055,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1143,7 +1143,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1242,7 +1242,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
...
@@ -30,7 +30,7 @@ from .file_utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -846,7 +846,7 @@ class BartModel(PretrainedBartModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",
@@ -981,7 +981,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
         new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
         self.register_buffer("final_logits_bias", new_bias)
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     @add_end_docstrings(BART_GENERATION_EXAMPLE)
     def forward(
@@ -1147,7 +1147,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         self.model._init_weights(self.classification_head.dense)
         self.model._init_weights(self.classification_head.out_proj)
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",
@@ -1234,7 +1234,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
         self.model._init_weights(self.qa_outputs)
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",
...
@@ -33,7 +33,7 @@ from .file_utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -748,7 +748,7 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -870,7 +870,7 @@ class BertForPreTraining(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -983,7 +983,7 @@ class BertLMHeadModel(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1103,7 +1103,7 @@ class BertForMaskedLM(BertPreTrainedModel):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1206,7 +1206,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1300,7 +1300,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1384,7 +1384,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1479,7 +1479,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1569,7 +1569,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
...
@@ -24,7 +24,7 @@ from .configuration_bert_generation import BertGenerationConfig
 from .file_utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_bert import BertEncoder
@@ -293,7 +293,7 @@ class BertGenerationEncoder(BertGenerationPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/bert_for_seq_generation_L-24_bbc_encoder",
@@ -421,7 +421,7 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head.decoder
-    @add_start_docstrings_to_callable(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
...
@@ -24,7 +24,7 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from .modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 from .utils import logging
@@ -349,7 +349,7 @@ class CTRLModel(CTRLPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].multi_head_attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="ctrl",
@@ -521,7 +521,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         return {"input_ids": input_ids, "past_key_values": past, "use_cache": kwargs["use_cache"]}
-    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="ctrl",
...
@@ -24,7 +24,7 @@ from torch.nn import CrossEntropyLoss
 from .activations import ACT2FN
 from .configuration_deberta import DebertaConfig
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from .modeling_outputs import BaseModelOutput, SequenceClassifierOutput
 from .modeling_utils import PreTrainedModel
 from .utils import logging
@@ -858,7 +858,7 @@ class DebertaModel(DebertaPreTrainedModel):
         """
         raise NotImplementedError("The prune function is not implemented in DeBERTa model.")
-    @add_start_docstrings_to_callable(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="microsoft/deberta-base",
@@ -976,7 +976,7 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel):
     def set_input_embeddings(self, new_embeddings):
         self.deberta.set_input_embeddings(new_embeddings)
-    @add_start_docstrings_to_callable(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="microsoft/deberta-base",
...
@@ -32,7 +32,7 @@ from .configuration_distilbert import DistilBertConfig
 from .file_utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -436,7 +436,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.transformer.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="distilbert-base-uncased",
@@ -509,7 +509,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     def get_output_embeddings(self):
         return self.vocab_projector
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="distilbert-base-uncased",
@@ -595,7 +595,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="distilbert-base-uncased",
@@ -676,7 +676,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="distilbert-base-uncased",
@@ -772,7 +772,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="distilbert-base-uncased",
@@ -856,7 +856,9 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(
+        DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
     @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
...
@@ -22,7 +22,12 @@ import torch
 from torch import Tensor, nn
 from .configuration_dpr import DPRConfig
-from .file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings
+from .file_utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
 from .modeling_bert import BertModel
 from .modeling_outputs import BaseModelOutputWithPooling
 from .modeling_utils import PreTrainedModel
@@ -431,7 +436,7 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
         self.ctx_encoder = DPREncoder(config)
         self.init_weights()
-    @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=DPRContextEncoderOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -509,7 +514,7 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
         self.question_encoder = DPREncoder(config)
         self.init_weights()
-    @add_start_docstrings_to_callable(DPR_ENCODERS_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DPR_ENCODERS_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=DPRQuestionEncoderOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -586,7 +591,7 @@ class DPRReader(DPRPretrainedReader):
         self.span_predictor = DPRSpanPredictor(config)
         self.init_weights()
-    @add_start_docstrings_to_callable(DPR_READER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(DPR_READER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=DPRReaderOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
...
@@ -30,7 +30,7 @@ from .file_utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -693,7 +693,7 @@ class ElectraModel(ElectraPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
@@ -791,7 +791,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
@@ -873,7 +873,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
         self.discriminator_predictions = ElectraDiscriminatorPredictions(config)
         self.init_weights()
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -971,7 +971,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
     def get_output_embeddings(self):
         return self.generator_lm_head
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
@@ -1060,7 +1060,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
         self.classifier = nn.Linear(config.hidden_size, config.num_labels)
         self.init_weights()
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
@@ -1147,7 +1147,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
@@ -1248,7 +1248,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/electra-small-discriminator",
...
@@ -19,7 +19,7 @@ from typing import Optional
 from .configuration_encoder_decoder import EncoderDecoderConfig
 from .configuration_utils import PretrainedConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings
+from .file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from .modeling_outputs import Seq2SeqLMOutput
 from .modeling_utils import PreTrainedModel
 from .utils import logging
@@ -335,7 +335,7 @@ class EncoderDecoderModel(PreTrainedModel):
         config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
         return cls(encoder=encoder, decoder=decoder, config=config)
-    @add_start_docstrings_to_callable(ENCODER_DECODER_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
...
@@ -21,7 +21,7 @@ import torch
 from torch.nn import functional as F
 from .configuration_flaubert import FlaubertConfig
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from .modeling_outputs import BaseModelOutput
 from .modeling_xlm import (
     XLMForMultipleChoice,
@@ -140,7 +140,7 @@ class FlaubertModel(XLMModel):
         self.layerdrop = getattr(config, "layerdrop", 0.0)
         self.pre_norm = getattr(config, "pre_norm", False)
-    @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="flaubert/flaubert_base_cased",
...
@@ -43,7 +43,7 @@ from .file_utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, Seq2SeqLMOutput, Seq2SeqModelOutput
@@ -899,7 +899,7 @@ class FSMTModel(PretrainedFSMTModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(FSMT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/wmt19-ru-en",
@@ -1039,7 +1039,7 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
         return new_embeddings
-    @add_start_docstrings_to_callable(FSMT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(FSMT_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     @add_end_docstrings(FSMT_GENERATION_EXAMPLE)
     def forward(
...
@@ -30,7 +30,7 @@ from .file_utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -906,7 +906,7 @@ class FunnelBaseModel(FunnelPreTrainedModel):
     def set_input_embeddings(self, new_embeddings):
         self.embeddings.word_embeddings = new_embeddings
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small-base",
@@ -983,7 +983,7 @@ class FunnelModel(FunnelPreTrainedModel):
     def set_input_embeddings(self, new_embeddings):
         self.embeddings.word_embeddings = new_embeddings
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small",
@@ -1082,7 +1082,7 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
         self.discriminator_predictions = FunnelDiscriminatorPredictions(config)
         self.init_weights()
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=FunnelForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1167,7 +1167,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small",
@@ -1240,7 +1240,7 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
         self.classifier = FunnelClassificationHead(config, config.num_labels)
         self.init_weights()
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small-base",
@@ -1317,7 +1317,7 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
         self.classifier = FunnelClassificationHead(config, 1)
         self.init_weights()
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small-base",
@@ -1403,7 +1403,7 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small",
@@ -1485,7 +1485,7 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(FUNNEL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="funnel-transformer/small",
...
@@ -30,7 +30,7 @@ from .file_utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
@@ -502,7 +502,7 @@ class GPT2Model(GPT2PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.h[layer].attn.prune_heads(heads)
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="gpt2",
@@ -723,7 +723,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             "attention_mask": attention_mask,
         }
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="gpt2",
@@ -837,7 +837,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
             "use_cache": kwargs.get("use_cache"),
         }
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -987,7 +987,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
         self.init_weights()
-    @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="microsoft/dialogrpt",
...