PyTorch CTRL + Style

7511f3dd · Lysandre · Lysandre Debut · 980211a6 · 7511f3dd · 7511f3dd
Commit 7511f3dd authored Jan 20, 2020 by Lysandre Committed by Lysandre Debut Jan 23, 2020
20 changed files
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
 CTRL
 ----------------------------------------------------
+CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
+by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
+corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+refer to the PyTorch documentation for all matter related to general usage and behavior.
 Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
 you'll be able to convert from TF to our HuggingFace/Transformers format using the 
 ``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).

--- a/src/transformers/configuration_xlm_roberta.py
+++ b/src/transformers/configuration_xlm_roberta.py
@@ -38,5 +38,6 @@ class XLMRobertaConfig(RobertaConfig):
    This class overrides :class:`~transformers.RobertaConfig`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "xlm-roberta"
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -607,7 +607,6 @@ class AlbertMLMHead(nn.Module):
    "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
 )
 class AlbertForMaskedLM(AlbertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -698,7 +697,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
    ALBERT_START_DOCSTRING,
 )
 class AlbertForSequenceClassification(AlbertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -794,7 +792,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
    ALBERT_START_DOCSTRING,
 )
 class AlbertForQuestionAnswering(AlbertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -813,7 +813,6 @@ class BertModel(BertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class BertForPreTraining(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -907,11 +906,8 @@ class BertForPreTraining(BertPreTrainedModel):
        return outputs  # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings(
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
-    """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
-)
 class BertForMaskedLM(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -1018,11 +1014,9 @@ class BertForMaskedLM(BertPreTrainedModel):
 @add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
-    BERT_START_DOCSTRING,
 )
 class BertForNextSentencePrediction(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -1105,7 +1099,6 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -1198,7 +1191,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class BertForMultipleChoice(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -1294,7 +1286,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class BertForTokenClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -1386,7 +1377,6 @@ class BertForTokenClassification(BertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class BertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__(config)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -58,19 +58,20 @@ class CamembertModel(RobertaModel):
    This class overrides :class:`~transformers.RobertaModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 @add_start_docstrings(
-    """CamemBERT Model with a `language modeling` head on top. """,
+    """CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING,
-    CAMEMBERT_START_DOCSTRING,
 )
 class CamembertForMaskedLM(RobertaForMaskedLM):
    """
    This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -85,6 +86,7 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification):
    This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -99,9 +101,11 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
    This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 @add_start_docstrings(
    """CamemBERT Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
@@ -112,6 +116,6 @@ class CamembertForTokenClassification(RobertaForTokenClassification):
    This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -24,7 +24,7 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings
+from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel
@@ -184,57 +184,53 @@ class CTRLPreTrainedModel(PreTrainedModel):
            module.weight.data.fill_(1.0)
-CTRL_START_DOCSTRING = r"""    CTRL model was proposed in
+CTRL_START_DOCSTRING = r"""    
-    `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
-    by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-    It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-    corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matter related to general usage and behavior.
-    .. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`:
-        https://www.github.com/salesforce/ctrl
-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
        config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
-CTRL_INPUTS_DOCSTRING = r"""    Inputs:
+CTRL_INPUTS_DOCSTRING = r"""    
-        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. 
-            CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
-            the right rather than the left.
            Indices can be obtained using :class:`transformers.CTRLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer):
+            `What are input IDs? <../glossary.html#input-ids>`__
-            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
-        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            `What are attention masks? <../glossary.html#attention-mask>`__
-            The embeddings from these tokens will be summed with the respective token embeddings.
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): 
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+            Segment token indices to indicate first and second portions of the inputs.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
-        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            `What are position IDs? <../glossary.html#position-ids>`_
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
-            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+        input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
-            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
 """
@@ -243,35 +239,8 @@ CTRL_INPUTS_DOCSTRING = r"""    Inputs:
 @add_start_docstrings(
    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
    CTRL_START_DOCSTRING,
-    CTRL_INPUTS_DOCSTRING,
 )
 class CTRLModel(CTRLPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the last layer of the model.
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
-            that contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-    Examples::
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLModel.from_pretrained('ctrl')
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-    """
    def __init__(self, config):
        super().__init__(config)
@@ -310,6 +279,7 @@ class CTRLModel(CTRLPreTrainedModel):
        for layer, heads in heads_to_prune.items():
            self.h[layer].attn.prune_heads(heads)
+    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
@@ -320,6 +290,36 @@ class CTRLModel(CTRLPreTrainedModel):
        head_mask=None,
        inputs_embeds=None,
    ):
+        r"""
+    Return:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`CTRLConfig`) and inputs:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the last layer of the model.
+        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
+            Contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
+            should not be passed as input ids as they have already been computed.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    Examples::
+        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
+        model = CTRLModel.from_pretrained('ctrl')
+        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
@@ -435,50 +435,10 @@ class CTRLModel(CTRLPreTrainedModel):
 @add_start_docstrings(
    """The CTRL Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """,
+    (linear layer with weights tied to the input embeddings). """,
    CTRL_START_DOCSTRING,
-    CTRL_INPUTS_DOCSTRING,
 )
 class CTRLLMHeadModel(CTRLPreTrainedModel):
-    r"""
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
-            computed for labels in ``[0, ..., config.vocab_size]``
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Language modeling loss.
-        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        **past**:
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(2, batch_size, num_heads, sequence_length, embed_size_per_head)``:
-            that contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-    Examples::
-        import torch
-        from transformers import CTRLTokenizer, CTRLLMHeadModel
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLLMHeadModel.from_pretrained('ctrl')
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-    """
    def __init__(self, config):
        super().__init__(config)
@@ -499,6 +459,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
        inputs.update(kwargs)
        return inputs
+    @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
@@ -510,6 +471,49 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
        inputs_embeds=None,
        labels=None,
    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
+            All labels set to ``-100`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
+    Return:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`~transformers.CTRLConfig`) and inputs:
+        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
+            Language modeling loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
+            Contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
+            should not be passed as input ids as they have already been computed.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    Examples::
+        import torch
+        from transformers import CTRLTokenizer, CTRLLMHeadModel
+        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
+        model = CTRLLMHeadModel.from_pretrained('ctrl')
+        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=input_ids)
+        loss, logits = outputs[:2]
+        """
        transformer_outputs = self.transformer(
            input_ids,
            past=past,

--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -390,7 +390,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertModel(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -484,11 +483,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """,
+    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
-    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
@@ -567,7 +564,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
@@ -645,7 +641,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
@@ -745,7 +740,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class DistilBertForTokenClassification(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -269,12 +269,6 @@ GPT2_START_DOCSTRING = r"""
    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
    refer to the PyTorch documentation for all matter related to general usage and behavior.
-    .. _`Language Models are Unsupervised Multitask Learners`:
-        https://openai.com/blog/better-language-models/
-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
@@ -328,7 +322,6 @@ GPT2_INPUTS_DOCSTRING = r"""
    GPT2_START_DOCSTRING,
 )
 class GPT2Model(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_hidden_states = config.output_hidden_states
@@ -514,7 +507,6 @@ class GPT2Model(GPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
 )
 class GPT2LMHeadModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
@@ -624,7 +616,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
 )
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1

--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -338,7 +338,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
    OPENAI_GPT_START_DOCSTRING,
 )
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
@@ -493,7 +492,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    OPENAI_GPT_START_DOCSTRING,
 )
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = OpenAIGPTModel(config)
@@ -587,7 +585,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    OPENAI_GPT_START_DOCSTRING,
 )
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -150,6 +150,7 @@ class RobertaModel(BertModel):
    This class overrides :class:`~transformers.BertModel`. Please check the
    superclass for the appropriate documentation alongside usage examples.
    """
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    base_model_prefix = "roberta"
@@ -167,9 +168,7 @@ class RobertaModel(BertModel):
        self.embeddings.word_embeddings = value
-@add_start_docstrings(
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
-    """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
-)
 class RobertaForMaskedLM(BertPreTrainedModel):
    config_class = RobertaConfig
    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -652,7 +651,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
-            inputs_embeds=inputs_embeds
+            inputs_embeds=inputs_embeds,
        )
        sequence_output = outputs[0]

--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -560,7 +560,6 @@ ALBERT_INPUTS_DOCSTRING = r"""
    ALBERT_START_DOCSTRING,
 )
 class TFAlbertModel(TFAlbertPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.num_hidden_layers = config.num_hidden_layers
@@ -705,11 +704,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
        return outputs
-@add_start_docstrings(
+@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
-    """Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING
-)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
@@ -766,7 +762,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
    ALBERT_START_DOCSTRING,
 )
 class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -660,7 +660,6 @@ BERT_INPUTS_DOCSTRING = r"""
    BERT_START_DOCSTRING,
 )
 class TFBertModel(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")
@@ -711,7 +710,6 @@ class TFBertModel(TFBertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class TFBertForPreTraining(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -767,11 +765,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
        return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
-@add_start_docstrings(
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
-    """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING
-)
 class TFBertForMaskedLM(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -822,11 +817,9 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
 @add_start_docstrings(
-    """Bert Model with a `next sentence prediction (classification)` head on top. """,
+    """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
-    BERT_START_DOCSTRING,
 )
 class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -879,7 +872,6 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class TFBertForSequenceClassification(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -938,7 +930,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class TFBertForMultipleChoice(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -1049,7 +1040,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class TFBertForTokenClassification(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -1108,7 +1098,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
    BERT_START_DOCSTRING,
 )
 class TFBertForQuestionAnswering(TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -536,7 +536,6 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings
@@ -594,11 +593,9 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
 @add_start_docstrings(
-    """DistilBert Model with a `masked language modeling` head on top. """,
+    """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
-    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
@@ -665,7 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -730,7 +726,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -786,7 +781,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    DISTILBERT_START_DOCSTRING,
 )
 class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -444,7 +444,6 @@ GPT2_INPUTS_DOCSTRING = r"""
    GPT2_START_DOCSTRING,
 )
 class TFGPT2Model(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -494,7 +493,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
 )
 class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -557,7 +555,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    GPT2_START_DOCSTRING,
 )
 class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1

--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -427,7 +427,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
    OPENAI_GPT_START_DOCSTRING,
 )
 class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -473,7 +472,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    OPENAI_GPT_START_DOCSTRING,
 )
 class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -531,7 +529,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    OPENAI_GPT_START_DOCSTRING,
 )
 class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1

--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -180,7 +180,6 @@ ROBERTA_INPUTS_DOCSTRING = r"""
    ROBERTA_START_DOCSTRING,
 )
 class TFRobertaModel(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")
@@ -256,11 +255,8 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
        return x
-@add_start_docstrings(
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
-    """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING
-)
 class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
@@ -340,7 +336,6 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
    ROBERTA_START_DOCSTRING,
 )
 class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -394,7 +389,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    ROBERTA_START_DOCSTRING,
 )
 class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -687,7 +687,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
    TRANSFO_XL_START_DOCSTRING,
 )
 class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@@ -737,7 +736,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    TRANSFO_XL_START_DOCSTRING,
 )
 class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")

--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -571,7 +571,6 @@ XLM_INPUTS_DOCSTRING = r"""
    XLM_START_DOCSTRING,
 )
 class TFXLMModel(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -650,7 +649,6 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
    XLM_START_DOCSTRING,
 )
 class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -705,7 +703,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
    XLM_START_DOCSTRING,
 )
 class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -760,7 +757,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
    XLM_START_DOCSTRING,
 )
 class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")

--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -780,7 +780,6 @@ XLNET_INPUTS_DOCSTRING = r"""
    XLNET_START_DOCSTRING,
 )
 class TFXLNetModel(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -830,7 +829,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
    XLNET_START_DOCSTRING,
 )
 class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -896,7 +894,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
    XLNET_START_DOCSTRING,
 )
 class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -961,7 +958,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
    XLNET_START_DOCSTRING,
 )
 class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
@@ -1015,11 +1011,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
        return outputs  # return logits, (mems), (hidden states), (attentions)
-@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+@add_start_docstrings(
+    """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XLNET_START_DOCSTRING)
+    XLNET_START_DOCSTRING,
+)
 class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")

--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -550,7 +550,6 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
    TRANSFO_XL_START_DOCSTRING,
 )
 class TransfoXLModel(TransfoXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.output_attentions = config.output_attentions
@@ -803,7 +802,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
    TRANSFO_XL_START_DOCSTRING,
 )
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.transformer = TransfoXLModel(config)