Docs: formatting nits (#32247)

* doc formatting nits * ignore non-autodocs * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/esm/modeling_esm.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/esm/modeling_esm.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * make fixup --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

Docs: formatting nits (#32247)
* doc formatting nits * ignore non-autodocs * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/esm/modeling_esm.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/esm/modeling_esm.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * make fixup --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
e68ec18c · Joao Gante · GitHub · 2fbbcf50 · e68ec18c · e68ec18c
Unverified Commit e68ec18c authored Jul 30, 2024 by Joao Gante Committed by GitHub Jul 30, 2024
20 changed files
--- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
+++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py
@@ -175,7 +175,7 @@ class TFConvNextV2Layer(keras.layers.Layer):
            Model configuration class.
        dim (`int`):
            Number of input channels.
-        drop_path (`float`, defaults to 0.0):
+        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    """


--- a/src/transformers/models/data2vec/modeling_data2vec_text.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -1077,7 +1077,7 @@ class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to *{}*):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

--- a/src/transformers/models/dbrx/configuration_dbrx.py
+++ b/src/transformers/models/dbrx/configuration_dbrx.py
@@ -37,8 +37,8 @@ class DbrxAttentionConfig(PretrainedConfig):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*):
            If set, clip the queries, keys, and values in the attention layer to this value.
-        kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
-        rope_theta (`float`, defaults to 10000.0): The base frequency for rope.
+        kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
    """

    def __init__(
@@ -92,11 +92,11 @@ class DbrxFFNConfig(PretrainedConfig):
        ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of the activation function along with
            any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
-        ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network.
-        moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer.
-        moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer.
+        ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
+        moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
+        moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
-        moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer.
+        moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
    """


--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -144,7 +144,7 @@ def load_balancing_loss_func(
            Number of experts.
        top_k (`int`):
            The number of experts each token is routed to.
-        attention_mask (`torch.Tensor`, None):
+        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

@@ -757,16 +757,16 @@ class DbrxBlock(nn.Module):
        Args:
            hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)`
-            attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length)
+            attention_mask (`torch.Tensor`, *optional*): attention mask of size (batch_size, sequence_length)
                if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length)
                if default attention is used.
-            past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states
-            output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all
+            past_key_value (`Tuple(torch.Tensor)`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all
                attention layers. See `attentions` under returned tensors for more detail.
-            output_router_logits (`bool`, optional): Whether or not to return the router logits.
-            use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are
+            output_router_logits (`bool`, *optional*): Whether or not to return the router logits.
+            use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are
                returned and can be used to speed up decoding (see `past_key_values`).
-            cache_position (`torch.LongTensor`, optional): position ids of the cache
+            cache_position (`torch.LongTensor`, *optional*): position ids of the cache
        """

        # Norm + Attention + Norm

--- a/src/transformers/models/deberta/configuration_deberta.py
+++ b/src/transformers/models/deberta/configuration_deberta.py
@@ -80,7 +80,7 @@ class DebertaConfig(PretrainedConfig):
        pos_att_type (`List[str]`, *optional*):
            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
            `["p2c", "c2p"]`.
-        layer_norm_eps (`float`, optional, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.

    Example:

--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -602,10 +602,10 @@ class DisentangledSelfAttention(nn.Module):
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

-            output_attentions (`bool`, optional):
+            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

-            query_states (`torch.FloatTensor`, optional):
+            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):

--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -669,10 +669,10 @@ class TFDebertaDisentangledSelfAttention(keras.layers.Layer):
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

-            return_att (`bool`, optional):
+            return_att (`bool`, *optional*):
                Whether return the attention matrix.

-            query_states (`tf.Tensor`, optional):
+            query_states (`tf.Tensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`tf.Tensor`):

--- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -80,7 +80,7 @@ class DebertaV2Config(PretrainedConfig):
        pos_att_type (`List[str]`, *optional*):
            The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`,
            `["p2c", "c2p"]`, `["p2c", "c2p"]`.
-        layer_norm_eps (`float`, optional, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.

    Example:

--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -678,10 +678,10 @@ class DisentangledSelfAttention(nn.Module):
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

-            output_attentions (`bool`, optional):
+            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

-            query_states (`torch.FloatTensor`, optional):
+            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):

--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -738,10 +738,10 @@ class TFDebertaV2DisentangledSelfAttention(keras.layers.Layer):
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

-            return_att (`bool`, optional):
+            return_att (`bool`, *optional*):
                Whether return the attention matrix.

-            query_states (`tf.Tensor`, optional):
+            query_states (`tf.Tensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`tf.Tensor`):

--- a/src/transformers/models/ernie/modeling_ernie.py
+++ b/src/transformers/models/ernie/modeling_ernie.py
@@ -1019,7 +1019,7 @@ class ErnieForPreTraining(ErniePreTrainedModel):

                - 0 indicates sequence B is a continuation of sequence A,
                - 1 indicates sequence B is a random sequence.
-            kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+            kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
                Used to hide legacy arguments that have been deprecated.

        Returns:

--- a/src/transformers/models/esm/modeling_esm.py
+++ b/src/transformers/models/esm/modeling_esm.py
@@ -993,7 +993,7 @@ class EsmForMaskedLM(EsmPreTrainedModel):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

--- a/src/transformers/models/esm/modeling_tf_esm.py
+++ b/src/transformers/models/esm/modeling_tf_esm.py
@@ -1232,7 +1232,7 @@ class TFEsmForMaskedLM(TFEsmPreTrainedModel, TFMaskedLanguageModelingLoss):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -229,13 +229,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
    Dropout add function

    Args:
-        x (`torch.tensor`, *required*):
+        x (`torch.tensor`):
            input tensor
-        residual (`torch.tensor`, *required*):
+        residual (`torch.tensor`):
            residual tensor
-        prob (`float`, *required*):
+        prob (`float`):
            dropout probability
-        training (`bool`, *required*):
+        training (`bool`):
            training mode
    """
    out = F.dropout(x, p=prob, training=training)
@@ -315,7 +315,7 @@ class FalconAttention(nn.Module):
        Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

        Args:
-            fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
+            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
@@ -347,7 +347,7 @@ class FalconAttention(nn.Module):
        Merge heads together over the last dimension

        Args:
-            x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
+            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]

--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -389,16 +389,16 @@ class FlavaImageCodebookConfig(PretrainedConfig):
    documentation from [`PretrainedConfig`] for more information.

    Args:
-        num_groups (`int`, defaults to 4):
+        num_groups (`int`, *optional*, defaults to 4):
            Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
            internal calculation and estimations.
-        input_channels (`int`, defaults to 3):
+        input_channels (`int`, *optional*, defaults to 3):
            Number of channels in the image to be passed.
-        num_blocks_per_group (`int`, defaults to 2):
+        num_blocks_per_group (`int`, *optional*, defaults to 2):
            Number of conv-based blocks per group.
-        hidden_size (`int`, defaults to 256):
+        hidden_size (`int`, *optional*, defaults to 256):
            Size of hidden dim for the blocks.
-        vocab_size (`int`, defaults to 8192):
+        vocab_size (`int`, *optional*, defaults to 8192):
            Size of the output vocabulary for the codebook.
        freeze (`bool`, defaults to `True`):
            Whether to freeze the weights of the model.

--- a/src/transformers/models/flava/modeling_flava.py
+++ b/src/transformers/models/flava/modeling_flava.py
@@ -176,7 +176,7 @@ class FlavaForPreTrainingOutput(ModelOutput):
            The output of the [`FlavaTextModel`].
        multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
            The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
-        multimodal_masked_output (`BaseModelOutputWithPooling`, returned when `input_ids_masked` and `pixel_values` are present):
+        multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The output of the [`FlavaMultimodalModel`].

        mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):

--- a/src/transformers/models/fnet/modeling_fnet.py
+++ b/src/transformers/models/fnet/modeling_fnet.py
@@ -651,7 +651,7 @@ class FNetForPreTraining(FNetPreTrainedModel):

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
-        kwargs (`Dict[str, any]`, optional, defaults to *{}*):
+        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -501,9 +501,9 @@ class FSMTEncoder(nn.Module):
            BaseModelOutput or Tuple comprised of:

                - **x** (`torch.Tensor`): the last encoder layer's output of shape *(src_len, batch, embed_dim)*
-                - **encoder_states** (`Tuple(torch.FloatTensor`)): all intermediate hidden states of shape *(src_len,
+                - **encoder_states** (`Tuple(torch.FloatTensor)`): all intermediate hidden states of shape *(src_len,
                  batch, embed_dim)*. Only populated if *output_hidden_states:* is True.
-                - **all_attentions** (`Tuple(torch.FloatTensor`)): Attention weights for each layer.
+                - **all_attentions** (`Tuple(torch.FloatTensor)`): Attention weights for each layer.
                During training might not be of length n_layers because of layer dropout.
        """
        # check attention mask and invert

--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -839,7 +839,7 @@ PARALLELIZE_DOCSTRING = r"""
    it will evenly distribute blocks across all devices.

    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
+        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the

--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -587,7 +587,7 @@ PARALLELIZE_DOCSTRING = r"""
    across all devices.

    Args:
-        device_map (`Dict[int, list]`, optional, defaults to None):
+        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the GPT-J models have the