Unverified commit 48ff6d51, authored by Stas Bekman, committed by GitHub

[doc] remove the implied defaults to :obj:`None`, s/True/ :obj:`True/, etc. (#6956)

* remove the implied defaults to :obj:`None`

* fix bug in the original

* replace to :obj:`True`, :obj:`False`
parent eff274d6
@@ -242,7 +242,7 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
         labels=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
             If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
...
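For context, the `labels` argument these docstrings describe drives loss computation at call time. A minimal sketch, using plain ALBERT and the stock `albert-base-v2` checkpoint for illustration (not the PABEE variant in this file):

```python
import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

inputs = tokenizer("A perfectly fine sentence.", return_tensors="pt")
labels = torch.tensor([1])  # shape (batch_size,), values in [0, config.num_labels - 1]

# labels is optional: omitted (the implied None), no loss is computed;
# provided, the loss comes back as the first element of the outputs.
loss = model(**inputs, labels=labels)[0]
```

Omitting `labels` simply skips the loss, which is why spelling out "defaults to :obj:`None`" added no information.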
@@ -266,7 +266,7 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
         labels=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
...
@@ -302,7 +302,7 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
         train_highway=False,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
...
@@ -59,7 +59,7 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
         train_highway=False,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
...
@@ -426,35 +426,35 @@ MASKED_BERT_INPUTS_DOCSTRING = r"""
             :func:`transformers.PreTrainedTokenizer.__call__` for details.
             `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
             `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
             `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Indices of positions of each input sequence tokens in the position embeddings.
             Selected in the range ``[0, config.max_position_embeddings - 1]``.
             `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert `input_ids` indices into associated vectors
             than the model's internal embedding lookup matrix.
-        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
             if the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on the padding token indices of the encoder input. This mask
             is used in the cross-attention if the model is configured as a decoder.
             Mask values selected in ``[0, 1]``:
@@ -684,7 +684,7 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
@@ -769,7 +769,7 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above)
@@ -859,7 +859,7 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
         threshold (:obj:`float`):
@@ -946,11 +946,11 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
         threshold=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
...
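All of the input tensors documented above are `optional` precisely because the tokenizer (or the model itself) fills them in, so a literal `None` default carried no information. A short sketch of where they normally come from, using the stock `bert-base-uncased` tokenizer for illustration:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# __call__ builds input_ids, attention_mask and token_type_ids in one step;
# anything you do not pass to the model stays None and is derived internally.
enc = tokenizer("Sentence A", "Sentence B", return_tensors="pt")
print(enc["input_ids"].shape)  # (batch_size, sequence_length)
print(enc["attention_mask"])   # 1 = attend, 0 = padding
print(enc["token_type_ids"])   # 0 = sentence A tokens, 1 = sentence B tokens
```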
@@ -65,17 +65,17 @@ BART_CONFIG_ARGS_DOC = r"""
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
         init_std (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, optional, defaults to False):
+        add_bias_logits (:obj:`bool`, optional, defaults to :obj:`False`):
             True for marian only.
-        normalize_before (:obj:`bool`, optional, defaults to False):
+        normalize_before (:obj:`bool`, optional, defaults to :obj:`False`):
             Call layernorm before attention ops. True for pegasus, mbart. False for bart. FIXME: marian?
-        normalize_embedding (:obj:`bool`, optional, defaults to True):
+        normalize_embedding (:obj:`bool`, optional, defaults to :obj:`True`):
             Call layernorm after embeddings. Only True for Bart.
-        static_position_embeddings (:obj:`bool`, optional, defaults to False):
+        static_position_embeddings (:obj:`bool`, optional, defaults to :obj:`False`):
             Don't learn positional embeddings, use sinusoidal. True for marian, pegasus.
-        add_final_layer_norm (:obj:`bool`, optional, defaults to False):
+        add_final_layer_norm (:obj:`bool`, optional, defaults to :obj:`False`):
             Why not add another layernorm?
-        scale_embedding (:obj:`bool`, optional, defaults to False):
+        scale_embedding (:obj:`bool`, optional, defaults to :obj:`False`):
             Scale embeddings by diving by sqrt(d_model).
         eos_token_id (:obj:`int`, optional, defaults to 2)
             End of stream token id.
@@ -91,7 +91,7 @@ BART_CONFIG_ARGS_DOC = r"""
             How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
         num_labels: (:obj:`int`, optional, defaults to 3):
             for SequenceClassification
-        is_encoder_decoder (:obj:`bool`, optional, defaults to True):
+        is_encoder_decoder (:obj:`bool`, optional, defaults to :obj:`True`):
            Whether this is an encoder/decoder model
         force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only true for `bart-large-cnn`.
...
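These boolean flags are how the single Bart implementation serves several related architectures. A hedged sketch of a Pegasus-style configuration, with values taken from the descriptions above (illustrative, not part of this commit):

```python
from transformers import BartConfig

# Pegasus-style settings per the docstring: pre-layernorm, sinusoidal positions.
config = BartConfig(
    normalize_before=True,            # layernorm before attention ops (pegasus, mbart)
    static_position_embeddings=True,  # sinusoidal instead of learned positions (marian, pegasus)
    normalize_embedding=False,        # layernorm after embeddings is Bart-only
    scale_embedding=True,             # multiply embeddings by sqrt(d_model)
)
print(config.is_encoder_decoder)  # True by default
```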
@@ -88,7 +88,7 @@ class BertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        gradient_checkpointing (:obj:`bool`, optional, defaults to False):
+        gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
         Example::
...
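A quick illustration of the trade-off the `gradient_checkpointing` flag describes; a sketch, not part of the commit:

```python
from transformers import BertConfig, BertModel

# Activations are recomputed during the backward pass instead of being stored,
# cutting memory usage at the cost of a slower backward pass.
config = BertConfig(gradient_checkpointing=True)
model = BertModel(config)
```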
@@ -88,7 +88,7 @@ class ElectraConfig(PretrainedConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.ElectraForMultipleChoice`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.ElectraForMultipleChoice`.
             'gelu' => add a gelu activation to the output, Other => no activation.
...
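Here `None` is still a meaningful value (no activation); it just no longer needs restating as a default. A sketch of the field in use, assuming the shared `SequenceSummary` helper that reads these `summary_*` settings:

```python
from transformers import ElectraConfig
from transformers.modeling_utils import SequenceSummary

config = ElectraConfig(summary_activation="gelu")  # or None for no activation
summary = SequenceSummary(config)  # projection + gelu over the extracted vector
```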
@@ -117,7 +117,7 @@ class FlaubertConfig(XLMConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLMForSequenceClassification`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLMForSequenceClassification`.
             'tanh' => add a tanh activation to the output, Other => no activation.
...
@@ -85,7 +85,7 @@ class GPT2Config(PretrainedConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.GPT2DoubleHeadsModel`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.GPT2DoubleHeadsModel`.
             'tanh' => add a tanh activation to the output, Other => no activation.
...
@@ -85,25 +85,25 @@ class LxmertConfig(PretrainedConfig):
         num_attr_labels (:obj:`int`, optional, defaults to 400):
             This represents the total number of semantically unique attributes that lxmert will be able to classify a pooled-object feature
             as possessing.
-        task_matched (:obj:`bool`, optional, defaults to True):
+        task_matched (:obj:`bool`, optional, defaults to :obj:`True`):
             This task is used for sentence-image matching. If the sentence correctly describes the image the label will be 1.
             If the sentence does not correctly describe the image, the label will be 0.
-        task_mask_lm (:obj:`bool`, optional, defaults to True):
+        task_mask_lm (:obj:`bool`, optional, defaults to :obj:`True`):
             This task is the defacto masked langauge modeling used in pretraining models such as BERT.
-        task_obj_predict (:obj:`bool`, optional, defaults to True):
+        task_obj_predict (:obj:`bool`, optional, defaults to :obj:`True`):
             This task is set to true if the user would like to perform one of the following loss objectives:
             object predicition, atrribute predicition, feature regression
-        task_qa (:obj:`bool`, optional, defaults to True):
+        task_qa (:obj:`bool`, optional, defaults to :obj:`True`):
             This task specifies whether or not Lxmert will calculate the question-asnwering loss objective
-        visual_obj_loss (:obj:`bool`, optional, defaults to True):
+        visual_obj_loss (:obj:`bool`, optional, defaults to :obj:`True`):
             This task specifies whether or not Lxmert will calculate the object-prediction loss objective
-        visual_attr_loss (:obj:`bool`, optional, defaults to True):
+        visual_attr_loss (:obj:`bool`, optional, defaults to :obj:`True`):
             This task specifies whether or not Lxmert will calculate the attribute-prediction loss objective
-        visual_feat_loss (:obj:`bool`, optional, defaults to True):
+        visual_feat_loss (:obj:`bool`, optional, defaults to :obj:`True`):
             This task specifies whether or not Lxmert will calculate the feature-regression loss objective
-        output_attentions (:obj:`bool`, optional, defaults to False):
+        output_attentions (:obj:`bool`, optional, defaults to :obj:`False`):
             if True, the vision, langauge, and cross-modality layers will be returned
-        output_hidden_states (:obj:`bool`, optional, defaults to False):
+        output_hidden_states (:obj:`bool`, optional, defaults to :obj:`False`):
             if True, final cross-modality hidden states for language and vision features will be returned
     """
...
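The `task_*` flags simply toggle which pretraining losses LXMERT computes; switching one off drops that term from the total loss. Illustrative only:

```python
from transformers import LxmertConfig

# Pretrain without the QA head, keeping the other objectives.
config = LxmertConfig(task_qa=False)
print(config.task_mask_lm, config.task_obj_predict)  # True True (the defaults)
```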
@@ -67,15 +67,15 @@ class MobileBertConfig(PretrainedConfig):
             The ID of the token in the word embedding to use as padding.
         embedding_size (:obj:`int`, optional, defaults to 128):
             The dimension of the word embedding vectors.
-        trigram_input (:obj:`bool`, optional, defaults to True):
+        trigram_input (:obj:`bool`, optional, defaults to :obj:`True`):
             Use a convolution of trigram as input.
-        use_bottleneck (:obj:`bool`, optional, defaults to True):
+        use_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
             Whether to use bottleneck in BERT.
         intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
             Size of bottleneck layer output.
-        use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
+        use_bottleneck_attention (:obj:`bool`, optional, defaults to :obj:`False`):
             Whether to use attention inputs from the bottleneck transformation.
-        key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
+        key_query_shared_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
             Whether to use the same linear transformation for query&key in the bottleneck.
         num_feedforward_networks (:obj:`int`, optional, defaults to 4):
             Number of FFNs in a block.
...
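The bottleneck flags map directly onto `MobileBertConfig` constructor arguments; disabling the bottleneck recovers a more BERT-like block. A sketch with illustrative values:

```python
from transformers import MobileBertConfig, MobileBertModel

# Ablate the bottleneck while keeping trigram input.
config = MobileBertConfig(use_bottleneck=False, trigram_input=True)
model = MobileBertModel(config)
```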
@@ -81,7 +81,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
             'tanh' => add a tanh activation to the output, Other => no activation.
...
@@ -45,7 +45,7 @@ class ReformerConfig(PretrainedConfig):
             LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
             For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
             For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
-        axial_pos_embds (:obj:`bool`, optional, defaults to True):
+        axial_pos_embds (:obj:`bool`, optional, defaults to :obj:`True`):
             If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
         axial_norm_std (:obj:`float`, optional, defaluts to 1.0):
             The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
@@ -77,7 +77,7 @@ class ReformerConfig(PretrainedConfig):
             Dimensionality of the output hidden states of the residual attention blocks.
         initializer_range (:obj:`float`, optional, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        is_decoder (:obj:`bool`, optional, defaults to False):
+        is_decoder (:obj:`bool`, optional, defaults to :obj:`False`):
             If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
             When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
         layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
...
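`is_decoder=True` is what turns on the causal mask for language modeling with Reformer, and `axial_pos_embds` trades one large position-embedding matrix for two factorized ones. A hedged sketch:

```python
from transformers import ReformerConfig, ReformerModelWithLMHead

# Causal LM setup: causal mask on, axial (factorized) position embeddings.
config = ReformerConfig(is_decoder=True, axial_pos_embds=True)
model = ReformerModelWithLMHead(config)
```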
@@ -65,7 +65,7 @@ class RetriBertConfig(PretrainedConfig):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        share_encoders (:obj:`bool`, optional, defaults to True):
+        share_encoders (:obj:`bool`, optional, defaults to :obj:`True`):
             Whether to use the same Bert-type encoder for the queries and document
         projection_dim (:obj:`int`, optional, defaults to 128):
             Final dimension of the query and document representation after projection
...
@@ -116,7 +116,7 @@ class XLMConfig(PretrainedConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLMForSequenceClassification`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLMForSequenceClassification`.
             'tanh' => add a tanh activation to the output, Other => no activation.
...
@@ -65,12 +65,12 @@ class XLNetConfig(PretrainedConfig):
             The epsilon used by the layer normalization layers.
         dropout (:obj:`float`, optional, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+        mem_len (:obj:`int` or :obj:`None`, optional):
             The number of tokens to cache. The key/value pairs that have already been pre-computed
             in a previous forward pass won't be re-computed. See the
             `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
             for more information.
-        reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+        reuse_len (:obj:`int` or :obj:`None`, optional):
             The number of tokens in the current batch to be cached and reused in the future.
         bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
             Whether to use bidirectional input pipeline. Usually set to `True` during
@@ -94,7 +94,7 @@ class XLNetConfig(PretrainedConfig):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
             Add a projection after the vector extraction
-        summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+        summary_activation (:obj:`string` or :obj:`None`, optional):
             Argument used when doing sequence summary. Used in for the multiple choice head in
             :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
             'tanh' => add a tanh activation to the output, Other => no activation.
...
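`mem_len` is the Transformer-XL-style memory: with it set, each forward pass returns `mems` that can be fed back so cached key/value pairs are not recomputed. A sketch, with the checkpoint name and chunking purely illustrative:

```python
from transformers import XLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased", mem_len=1024)

first = model(**tokenizer("First chunk of a long document.", return_tensors="pt"),
              return_dict=True)
# Feed the cached key/value pairs back in for the next chunk.
second = model(**tokenizer("Next chunk.", return_tensors="pt"),
               mems=first.mems, return_dict=True)
```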
@@ -476,36 +476,36 @@ ALBERT_INPUTS_DOCSTRING = r"""
             :func:`transformers.PreTrainedTokenizer` for details.
             `What are input IDs? <../glossary.html#input-ids>`__
-        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
             `What are attention masks? <../glossary.html#attention-mask>`__
-        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Segment token indices to indicate first and second portions of the inputs.
             Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
             corresponds to a `sentence B` token
             `What are token type IDs? <../glossary.html#token-type-ids>`_
-        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Indices of positions of each input sequence tokens in the position embeddings.
             Selected in the range ``[0, config.max_position_embeddings - 1]``.
             `What are position IDs? <../glossary.html#position-ids>`_
-        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
-        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert `input_ids` indices into associated vectors
             than the model's internal embedding lookup matrix.
-        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        output_attentions (:obj:`bool`, `optional`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        output_hidden_states (:obj:`bool`, `optional`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        return_dict (:obj:`bool`, `optional`):
             If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
             plain tuple.
 """
@@ -674,12 +674,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         **kwargs,
     ):
         r"""
-        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
-        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
             Indices should be in ``[0, 1]``.
             ``0`` indicates original order (sequence A, then sequence B),
@@ -829,7 +829,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         **kwargs
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
@@ -915,7 +915,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
             If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
@@ -999,7 +999,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
         """
@@ -1083,11 +1083,11 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
@@ -1179,7 +1179,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above)
...
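The `-100` convention in the masked-LM `labels` above is the `ignore_index` of PyTorch's cross-entropy loss; a common recipe is to clone the inputs and blank out every position that was not masked. A self-contained sketch with made-up token ids:

```python
import torch

input_ids = torch.tensor([[2, 10, 4, 15, 3]])
masked = torch.tensor([[False, False, True, False, False]])  # positions replaced by [MASK]

labels = input_ids.clone()
labels[~masked] = -100  # ignored by the loss; only masked tokens contribute
```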
@@ -97,17 +97,17 @@ BART_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them.
             Padding will be ignored by default should you provide it.
             Indices can be obtained using :class:`transformers.BartTokenizer.encode(text)`.
-        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Mask to avoid performing attention on padding token indices in input_ids.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`):
+        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
             Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`)
-            `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder.
+            `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`) is a sequence of hidden-states at the output of the last layer of the encoder.
             Used in the cross-attention of the decoder.
-        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
+        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
             Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper.
-        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`):
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
             Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
             If you want to change padding behavior, you should read :func:`~transformers.modeling_bart._prepare_decoder_inputs` and modify.
             See diagram 1 in the paper for more info on the default strategy
@@ -120,11 +120,11 @@ BART_INPUTS_DOCSTRING = r"""
         use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
             If `use_cache` is True, ``past_key_values`` are returned and can be used to speed up decoding (see
             ``past_key_values``).
-        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        output_attentions (:obj:`bool`, `optional`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
-        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        output_hidden_states (:obj:`bool`, `optional`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+        return_dict (:obj:`bool`, `optional`):
             If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
             plain tuple.
 """
@@ -1012,7 +1012,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
         **unused,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
             Labels for computing the masked language modeling loss.
             Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
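For the seq2seq case the same `labels` mechanism applies, and `decoder_input_ids` can be left to the model, which derives them by shifting right as the docstring above describes. A sketch, with the checkpoint and text purely illustrative:

```python
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

batch = tokenizer("A long article to summarize ...", return_tensors="pt")
targets = tokenizer("A short summary.", return_tensors="pt")

# decoder_input_ids are created internally by shifting right; only labels are needed.
outputs = model(**batch, labels=targets["input_ids"], return_dict=True)
print(outputs.loss)
```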
@@ -1177,7 +1177,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         return_dict=None,
     ):
         r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for computing the sequence classification/regression loss.
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
@@ -1264,11 +1264,11 @@ class BartForQuestionAnswering(PretrainedBartModel):
         return_dict=None,
     ):
         r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
...
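`start_positions`/`end_positions` are plain token indices into the (clamped) sequence; each contributes a cross-entropy term. A sketch using BERT for brevity, with made-up span indices:

```python
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

inputs = tokenizer("Who wrote it?", "It was written by Ada.", return_tensors="pt")
# Token indices of the answer span; the values here are illustrative only.
outputs = model(**inputs,
                start_positions=torch.tensor([9]),
                end_positions=torch.tensor([9]),
                return_dict=True)
print(outputs.loss)
```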
...@@ -664,36 +664,36 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -664,36 +664,36 @@ BERT_INPUTS_DOCSTRING = r"""
:func:`transformers.PreTrainedTokenizer.__call__` for details. :func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token corresponds to a `sentence B` token
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings. Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``. Selected in the range ``[0, config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`):
If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attention tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): output_hidden_states (:obj:`bool`, `optional`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): return_dict (:obj:`bool`, `optional`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple. plain tuple.
""" """
...@@ -768,10 +768,10 @@ class BertModel(BertPreTrainedModel): ...@@ -768,10 +768,10 @@ class BertModel(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder. if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder. is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
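These two arguments only matter when :class:`BertModel` runs as a decoder. A hedged sketch with toy tensors standing in for a real encoder (``add_cross_attention`` is an assumption needed on newer releases; on releases contemporary with this diff, ``is_decoder=True`` alone enables the cross-attention layers):

    import torch
    from transformers import BertConfig, BertModel

    config = BertConfig.from_pretrained("bert-base-uncased", is_decoder=True, add_cross_attention=True)
    decoder = BertModel.from_pretrained("bert-base-uncased", config=config)
    input_ids = torch.tensor([[101, 7592, 2088, 102]])             # toy decoder input ids
    encoder_hidden_states = torch.randn(1, 7, config.hidden_size)  # (batch_size, sequence_length, hidden_size)
    encoder_attention_mask = torch.ones(1, 7, dtype=torch.long)    # 1 = attend, 0 = padding
    outputs = decoder(input_ids=input_ids,
                      encoder_hidden_states=encoder_hidden_states,
                      encoder_attention_mask=encoder_attention_mask)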
...@@ -883,12 +883,12 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -883,12 +883,12 @@ class BertForPreTraining(BertPreTrainedModel):
**kwargs **kwargs
): ):
r""" r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``. in ``[0, ..., config.vocab_size]``.
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring). Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring).
Indices should be in ``[0, 1]``. Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A, ``0`` indicates sequence B is a continuation of sequence A,
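A sketch of how the two pre-training heads consume these labels; the checkpoint and sentences are placeholders:

    import torch
    from transformers import BertForPreTraining, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")
    enc = tokenizer("The capital of France", "is Paris.", return_tensors="pt")
    labels = enc["input_ids"].clone()
    labels[enc["input_ids"] == tokenizer.cls_token_id] = -100   # -100 = ignored by the MLM loss
    outputs = model(**enc, labels=labels,
                    next_sentence_label=torch.tensor([0]),      # 0 = sentence B really follows A
                    return_dict=True)
    print(outputs.loss)                                         # combined MLM + NSP loss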
...@@ -992,15 +992,15 @@ class BertLMHeadModel(BertPreTrainedModel): ...@@ -992,15 +992,15 @@ class BertLMHeadModel(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder. if the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder. is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the left-to-right language modeling loss (next word prediction). Labels for computing the left-to-right language modeling loss (next word prediction).
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
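For this left-to-right loss the labels are simply the input ids themselves; the model shifts them one position internally. A minimal sketch, again assuming ``bert-base-uncased``:

    from transformers import BertConfig, BertLMHeadModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    config = BertConfig.from_pretrained("bert-base-uncased", is_decoder=True)  # required for BertLMHeadModel
    model = BertLMHeadModel.from_pretrained("bert-base-uncased", config=config)
    enc = tokenizer("Hello, my dog is cute.", return_tensors="pt")
    outputs = model(**enc, labels=enc["input_ids"], return_dict=True)  # next-word prediction loss
    print(outputs.loss)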
...@@ -1114,7 +1114,7 @@ class BertForMaskedLM(BertPreTrainedModel): ...@@ -1114,7 +1114,7 @@ class BertForMaskedLM(BertPreTrainedModel):
**kwargs **kwargs
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
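A sketch of the ``-100`` convention: ignore every position except the masked one, whose label is the gold token id (the target word "paris" is an arbitrary choice):

    import torch
    from transformers import BertForMaskedLM, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    enc = tokenizer("The capital of France is [MASK].", return_tensors="pt")
    labels = torch.full_like(enc["input_ids"], -100)            # ignore everything...
    mask_positions = enc["input_ids"] == tokenizer.mask_token_id
    labels[mask_positions] = tokenizer.convert_tokens_to_ids("paris")  # ...except the masked slot
    outputs = model(**enc, labels=labels, return_dict=True)
    print(outputs.loss)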
...@@ -1210,7 +1210,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -1210,7 +1210,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring). Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring).
Indices should be in ``[0, 1]``. Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A, ``0`` indicates sequence B is a continuation of sequence A,
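A minimal usage sketch (the sentence pair is invented; ``0`` asserts that B follows A):

    import torch
    from transformers import BertForNextSentencePrediction, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
    enc = tokenizer("How old are you?", "I am 21 years old.", return_tensors="pt")
    outputs = model(**enc, next_sentence_label=torch.tensor([0]), return_dict=True)
    print(outputs.loss)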
...@@ -1306,7 +1306,7 @@ class BertForSequenceClassification(BertPreTrainedModel): ...@@ -1306,7 +1306,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
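A sketch of the classification/regression switch: ``num_labels=3`` (an arbitrary choice) takes the cross-entropy path below, while ``num_labels == 1`` with a float label would take the MSE path instead:

    import torch
    from transformers import BertForSequenceClassification, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
    enc = tokenizer("a perfectly ordinary sentence", return_tensors="pt")
    outputs = model(**enc, labels=torch.tensor([2]), return_dict=True)  # class index in [0, num_labels - 1]
    print(outputs.loss, outputs.logits.shape)                           # logits: (batch_size, num_labels)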
...@@ -1389,7 +1389,7 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1389,7 +1389,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors (see `input_ids` above). of the input tensors (see `input_ids` above).
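A sketch of the ``(batch_size, num_choices, sequence_length)`` layout this head expects; the prompt and choices are invented:

    import torch
    from transformers import BertForMultipleChoice, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMultipleChoice.from_pretrained("bert-base-uncased")
    prompt, choices = "The sky is", ["blue.", "a potato."]
    enc = tokenizer([prompt, prompt], choices, padding=True, return_tensors="pt")
    batch = {k: v.unsqueeze(0) for k, v in enc.items()}        # -> (1, num_choices, seq_len)
    outputs = model(**batch, labels=torch.tensor([0]), return_dict=True)  # index of the correct choice
    print(outputs.loss, outputs.logits.shape)                  # logits: (batch_size, num_choices)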
...@@ -1479,7 +1479,7 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1479,7 +1479,7 @@ class BertForTokenClassification(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
""" """
...@@ -1565,11 +1565,11 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1565,11 +1565,11 @@ class BertForQuestionAnswering(BertPreTrainedModel):
return_dict=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss. Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss. Positions outside of the sequence are not taken into account for computing the loss.
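A sketch of span labels for question answering; the start/end indices are illustrative, and out-of-range values would simply be clamped as noted above:

    import torch
    from transformers import BertForQuestionAnswering, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    enc = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")
    outputs = model(**enc, start_positions=torch.tensor([9]),   # token index where the answer starts
                    end_positions=torch.tensor([10]),           # token index where it ends
                    return_dict=True)
    print(outputs.loss)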
......