Unverified Commit e68ec18c authored by Joao Gante's avatar Joao Gante Committed by GitHub
Browse files

Docs: formatting nits (#32247)



* doc formatting nits

* ignore non-autodocs

* Apply suggestions from code review
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fixup

---------
Co-authored-by: default avataramyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 2fbbcf50
...@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel): ...@@ -892,7 +892,7 @@ class IBertForMaskedLM(IBertPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
...@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig): ...@@ -165,7 +165,7 @@ class IdeficsConfig(PretrainedConfig):
documentation from [`PretrainedConfig`] for more information. documentation from [`PretrainedConfig`] for more information.
Args: Args:
additional_vocab_size (`int`, *optional`, defaults to 0): additional_vocab_size (`int`, *optional*, defaults to 0):
Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens Additional vocabulary size of the model, typically for the special "<img>" token. Additional vocab tokens
are always trainable whereas regular vocab tokens can be frozen or not. are always trainable whereas regular vocab tokens can be frozen or not.
vocab_size (`int`, *optional*, defaults to 32000): vocab_size (`int`, *optional*, defaults to 32000):
......
...@@ -97,7 +97,7 @@ def load_balancing_loss_func( ...@@ -97,7 +97,7 @@ def load_balancing_loss_func(
router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -69,7 +69,7 @@ def load_balancing_loss_func( ...@@ -69,7 +69,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin): ...@@ -133,7 +133,7 @@ class Kosmos2Processor(ProcessorMixin):
Args: Args:
bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*): bboxes (`Union[List[Tuple[int]], List[Tuple[float]], List[List[Tuple[int]]], List[List[Tuple[float]]]]`, *optional*):
The bounding bboxes associated to `texts`. The bounding bboxes associated to `texts`.
num_image_tokens (`int`, defaults to 64): num_image_tokens (`int`, *optional*, defaults to 64):
The number of (consecutive) places that are used to mark the placeholders to store image information. The number of (consecutive) places that are used to mark the placeholders to store image information.
This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using. This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
first_image_token_id (`int`, *optional*): first_image_token_id (`int`, *optional*):
......
...@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): ...@@ -79,7 +79,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution. Calculate the number of patches after the preprocessing for images of any resolution.
Args: Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]): image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ? The size of the input image in the format (height, width). ?
grid_pinpoints (`List`): grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list A list containing possible resolutions. Each item in the list should be a tuple or list
......
...@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): ...@@ -85,7 +85,7 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
Calculate the number of patches after the preprocessing for images of any resolution. Calculate the number of patches after the preprocessing for images of any resolution.
Args: Args:
image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]): image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
The size of the input image in the format (height, width). ? The size of the input image in the format (height, width). ?
grid_pinpoints (`List`): grid_pinpoints (`List`):
A list containing possible resolutions. Each item in the list should be a tuple or list A list containing possible resolutions. Each item in the list should be a tuple or list
......
...@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel): ...@@ -1790,7 +1790,7 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:
......
...@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module): ...@@ -1810,7 +1810,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`): encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the Sequence of hidden-states at the output of the last layer of the encoder. Used in the
cross(masked)-attention of the decoder. cross(masked)-attention of the decoder.
feature_size_list (`List[torch.Size]` ): feature_size_list (`List[torch.Size]`):
This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder. This is a list containing shapes (height & width) of multi-scale features from the Pixel Decoder.
output_attentions (`bool`, *optional*): output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under Whether or not to return the attentions tensors of all attention layers. See `attentions` under
......
...@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel): ...@@ -1049,7 +1049,7 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
- 0 indicates sequence B is a continuation of sequence A, - 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence. - 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:
......
...@@ -84,7 +84,7 @@ def load_balancing_loss_func( ...@@ -84,7 +84,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r""" ...@@ -67,7 +67,7 @@ PARALLELIZE_DOCSTRING = r"""
it will evenly distribute blocks across all devices. it will evenly distribute blocks across all devices.
Args: Args:
device_map (`Dict[int, list]`, optional, defaults to None): device_map (`Dict[int, list]`, *optional*):
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
automatically mapped to the first device (for esoteric reasons). That means that the first device should automatically mapped to the first device (for esoteric reasons). That means that the first device should
have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the have fewer attention modules mapped to it than other devices. For reference, the mt5 models have the
......
...@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor): ...@@ -1160,7 +1160,7 @@ class OneFormerImageProcessor(BaseImageProcessor):
Args: Args:
outputs ([`OneFormerForUniversalSegmentationOutput`]): outputs ([`OneFormerForUniversalSegmentationOutput`]):
The outputs from [`OneFormerForUniversalSegmentationOutput`]. The outputs from [`OneFormerForUniversalSegmentationOutput`].
task_type (`str`, *optional)*, defaults to "instance"): task_type (`str`, *optional*, defaults to "instance"):
The post processing depends on the task token input. If the `task_type` is "panoptic", we need to The post processing depends on the task token input. If the `task_type` is "panoptic", we need to
ignore the stuff predictions. ignore the stuff predictions.
is_demo (`bool`, *optional*, defaults to `True`): is_demo (`bool`, *optional*, defaults to `True`):
......
...@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape): ...@@ -117,7 +117,7 @@ def _preprocess_resize_output_shape(image, output_shape):
channels is preserved. channels is preserved.
Returns Returns
image (`np.ndarray): image (`np.ndarray`):
The input image, but with additional singleton dimensions appended in the case where `len(output_shape) > The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
input.ndim`. input.ndim`.
output_shape (`Tuple`): output_shape (`Tuple`):
......
...@@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module): ...@@ -162,7 +162,7 @@ class PatchTSMixerNormLayer(nn.Module):
"""Normalization block """Normalization block
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module): ...@@ -234,7 +234,7 @@ class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
"""This module mixes the features in the channel dimension. """This module mixes the features in the channel dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module): ...@@ -441,7 +441,7 @@ class PatchMixerBlock(nn.Module):
"""This module mixes the patch dimension. """This module mixes the patch dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module): ...@@ -510,7 +510,7 @@ class FeatureMixerBlock(nn.Module):
"""This module mixes the hidden feature dimension. """This module mixes the hidden feature dimension.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module): ...@@ -556,7 +556,7 @@ class PatchTSMixerLayer(nn.Module):
The `PatchTSMixer` layer that does all three kinds of mixing. The `PatchTSMixer` layer that does all three kinds of mixing.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module): ...@@ -593,7 +593,7 @@ class PatchTSMixerBlock(nn.Module):
"""The main computing framework of the `PatchTSMixer` model. """The main computing framework of the `PatchTSMixer` model.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module): ...@@ -634,7 +634,8 @@ class PatchTSMixerForPredictionHead(nn.Module):
"""Prediction Head for Forecasting """Prediction Head for Forecasting
Args: Args:
config (`PatchTSMixerConfig`, *required*): Configuration. config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
...@@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module): ...@@ -689,8 +690,8 @@ class PatchTSMixerLinearHead(nn.Module):
"""Linear head for Classification and Regression. """Linear head for Classification and Regression.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration.
""" """
def __init__(self, config: PatchTSMixerConfig, distribution_output=None): def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
...@@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module): ...@@ -785,7 +786,7 @@ class PatchTSMixerPretrainHead(nn.Module):
"""Pretraining head. """Pretraining head.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel): ...@@ -1189,7 +1190,7 @@ class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings. Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
""" """
...@@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel): ...@@ -1411,7 +1412,7 @@ class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for mask pretraining. `PatchTSMixer` for mask pretraining.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel): ...@@ -1593,7 +1594,7 @@ class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for forecasting application. `PatchTSMixer` for forecasting application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel): ...@@ -1826,7 +1827,7 @@ class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for classification application. `PatchTSMixer` for classification application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
...@@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel): ...@@ -1997,7 +1998,7 @@ class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
`PatchTSMixer` for regression application. `PatchTSMixer` for regression application.
Args: Args:
config (`PatchTSMixerConfig`, *required*): config (`PatchTSMixerConfig`):
Configuration. Configuration.
Returns: Returns:
......
...@@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module): ...@@ -258,7 +258,7 @@ class PersimmonAttention(nn.Module):
storage as `fused_qkv` storage as `fused_qkv`
Args: Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]
Returns: Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
......
...@@ -75,7 +75,7 @@ def load_balancing_loss_func( ...@@ -75,7 +75,7 @@ def load_balancing_loss_func(
gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
shape [batch_size X sequence_length, num_experts]. shape [batch_size X sequence_length, num_experts].
attention_mask (`torch.Tensor`, None): attention_mask (`torch.Tensor`, *optional*):
The attention_mask used in forward function The attention_mask used in forward function
shape [batch_size X sequence_length] if not None. shape [batch_size X sequence_length] if not None.
num_experts (`int`, *optional*): num_experts (`int`, *optional*):
......
...@@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel): ...@@ -792,7 +792,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
...@@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel): ...@@ -1261,7 +1261,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `torch.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
......
...@@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss ...@@ -886,7 +886,7 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
...@@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL ...@@ -1400,7 +1400,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
reduce_loss (`bool`, *optional*): reduce_loss (`bool`, *optional*):
Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum` Only relevant if `labels` is passed. If `True`, the NLL loss is reduced using the `tf.Tensor.sum`
operation. operation.
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Legacy dictionary, which is required so that model can use *generate()* function. Legacy dictionary, which is required so that model can use *generate()* function.
Returns: Returns:
......
...@@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel): ...@@ -1073,7 +1073,7 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*): kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment