Docstring check (#26052)

* Fix number of minimal calls to the Hub with peft integration * Alternate design * And this way? * Revert * Nits to fix * Add util * Print when changes are made * Add list to ignore * Add more rules * Manual fixes * deal with kwargs * deal with enum defaults * avoid many digits for floats * Manual fixes * Fix regex * Fix regex * Auto fix * Style * Apply script * Add ignored list * Add check that templates are filled * Adding to CI checks * Add back semi-fix * Ignore more objects * More auto-fixes * Ignore missing objects * Remove temp semi-fix * Fixes * Update src/transformers/models/pvt/configuration_pvt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update utils/check_docstrings.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/utils/quantization_config.py Co-authored-by: Arthur <48595927+ArthurZucker...

Docstring check (#26052)
* Fix number of minimal calls to the Hub with peft integration * Alternate design * And this way? * Revert * Nits to fix * Add util * Print when changes are made * Add list to ignore * Add more rules * Manual fixes * deal with kwargs * deal with enum defaults * avoid many digits for floats * Manual fixes * Fix regex * Fix regex * Auto fix * Style * Apply script * Add ignored list * Add check that templates are filled * Adding to CI checks * Add back semi-fix * Ignore more objects * More auto-fixes * Ignore missing objects * Remove temp semi-fix * Fixes * Update src/transformers/models/pvt/configuration_pvt.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update utils/check_docstrings.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/utils/quantization_config.py Co-authored-by: Arthur <48595927+ArthurZucker...
03af4c42 · Sylvain Gugger · GitHub · 122b2657 · 03af4c42 · 03af4c42
Unverified Commit 03af4c42 authored Oct 04, 2023 by Sylvain Gugger Committed by GitHub Oct 04, 2023
20 changed files
--- a/src/transformers/models/cpmant/configuration_cpmant.py
+++ b/src/transformers/models/cpmant/configuration_cpmant.py
@@ -50,15 +50,17 @@ class CpmAntConfig(PretrainedConfig):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of layers of the Transformer encoder.
-        dropout_p (`float`, *optional*, defaults to 0.1):
+        dropout_p (`float`, *optional*, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder.
        position_bias_num_buckets (`int`, *optional*, defaults to 512):
            The number of position_bias buckets.
        position_bias_max_distance (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
-        eps (`float`, *optional*, defaults to 1e-6):
+        eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
+        init_std (`float`, *optional*, defaults to 1.0):
+            Initialize parameters with std = init_std.
        prompt_types (`int`, *optional*, defaults to 32):
            The type of prompt.
        prompt_length (`int`, *optional*, defaults to 32):
@@ -67,8 +69,6 @@ class CpmAntConfig(PretrainedConfig):
            The type of segment.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether to use cache.
-        init_std (`float`, *optional*, defaults to 1.0):
-            Initialize parameters with std = init_std.

    Example:


--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -54,7 +54,7 @@ class CTRLConfig(PretrainedConfig):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`int`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-06):
            The epsilon to use in the layer normalization layers
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

--- a/src/transformers/models/deberta/tokenization_deberta_fast.py
+++ b/src/transformers/models/deberta/tokenization_deberta_fast.py
@@ -99,9 +99,9 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
    refer to this superclass for more information regarding those methods.

    Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            The path to a tokenizer file to use instead of the vocab file.

--- a/src/transformers/models/deit/configuration_deit.py
+++ b/src/transformers/models/deit/configuration_deit.py
@@ -58,23 +58,23 @@ class DeiTConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
-        image_size (`int`, *optional*, defaults to `224`):
+        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
-        patch_size (`int`, *optional*, defaults to `16`):
+        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
-        num_channels (`int`, *optional*, defaults to `3`):
+        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
-        encoder_stride (`int`, `optional`, defaults to 16):
+        encoder_stride (`int`, *optional*, defaults to 16):
            Factor to increase the spatial resolution by in the decoder head for masked image modeling.

    Example:

--- a/src/transformers/models/deit/image_processing_deit.py
+++ b/src/transformers/models/deit/image_processing_deit.py
@@ -52,19 +52,19 @@ class DeiTImageProcessor(BaseImageProcessor):
            `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
            Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
-        resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
+        resample (`PILImageResampling` filter, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
            is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
-            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.

--- a/src/transformers/models/deprecated/mctct/configuration_mctct.py
+++ b/src/transformers/models/deprecated/mctct/configuration_mctct.py
@@ -53,7 +53,7 @@ class MCTCTConfig(PretrainedConfig):
            Dimensions of each attention head for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 920):
            The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction).
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        layerdrop (`float`, *optional*, defaults to 0.3):
            The probability of dropping an encoder layer during training. The default 0.3 value is used in the original
@@ -63,9 +63,9 @@ class MCTCTConfig(PretrainedConfig):
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.3):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3):
            The dropout ratio for the attention probabilities.
        pad_token_id (`int`, *optional*, defaults to 1):
            The tokenizer index of the pad token.
@@ -80,17 +80,17 @@ class MCTCTConfig(PretrainedConfig):
            The probability of randomly dropping the `Conv1dSubsampler` layer during training.
        num_conv_layers (`int`, *optional*, defaults to 1):
            Number of convolution layers before applying transformer encoder layers.
-        conv_kernel (`List[int]`, *optional*, defaults to `[7]`):
+        conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`):
            The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal
            to `num_conv_layers`.
-        conv_stride (`List[int]`, *optional*, defaults to `[3]`):
+        conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`):
            The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal
            to `num_conv_layers`.
        input_feat_per_channel (`int`, *optional*, defaults to 80):
            Feature dimensions of the channels of the input to the Conv1D layer.
        input_channels (`int`, *optional*, defaults to 1):
            Number of input channels of the input to the Conv1D layer.
-        conv_channels (`List[int]`, *optional*, defaults to None):
+        conv_channels (`List[int]`, *optional*):
            Channel sizes of intermediate Conv1D layers.
        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an

--- a/src/transformers/models/deprecated/van/configuration_van.py
+++ b/src/transformers/models/deprecated/van/configuration_van.py
@@ -57,9 +57,9 @@ class VanConfig(PretrainedConfig):
            `"selu"` and `"gelu_new"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
-        layer_scale_init_value (`float`, *optional*, defaults to 1e-2):
+        layer_scale_init_value (`float`, *optional*, defaults to 0.01):
            The initial value for layer scaling.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for stochastic depth.

--- a/src/transformers/models/dinat/configuration_dinat.py
+++ b/src/transformers/models/dinat/configuration_dinat.py
@@ -44,9 +44,9 @@ class DinatConfig(BackboneConfigMixin, PretrainedConfig):
            The number of input channels.
        embed_dim (`int`, *optional*, defaults to 64):
            Dimensionality of patch embedding.
-        depths (`List[int]`, *optional*, defaults to `[2, 2, 6, 2]`):
+        depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
            Number of layers in each level of the encoder.
-        num_heads (`List[int]`, *optional*, defaults to `[3, 6, 12, 24]`):
+        num_heads (`List[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
            Number of attention heads in each layer of the Transformer encoder.
        kernel_size (`int`, *optional*, defaults to 7):
            Neighborhood Attention kernel size.
@@ -67,7 +67,7 @@ class DinatConfig(BackboneConfigMixin, PretrainedConfig):
            `"selu"` and `"gelu_new"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        layer_scale_init_value (`float`, *optional*, defaults to 0.0):
            The initial value for the layer scale. Disabled if <=0.

--- a/src/transformers/models/dinov2/configuration_dinov2.py
+++ b/src/transformers/models/dinov2/configuration_dinov2.py
@@ -60,7 +60,7 @@ class Dinov2Config(BackboneConfigMixin, PretrainedConfig):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.

--- a/src/transformers/models/donut/configuration_donut_swin.py
+++ b/src/transformers/models/donut/configuration_donut_swin.py
@@ -45,15 +45,15 @@ class DonutSwinConfig(PretrainedConfig):
            The number of input channels.
        embed_dim (`int`, *optional*, defaults to 96):
            Dimensionality of patch embedding.
-        depths (`list(int)`, *optional*, defaults to [2, 2, 6, 2]):
+        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
            Depth of each layer in the Transformer encoder.
-        num_heads (`list(int)`, *optional*, defaults to [3, 6, 12, 24]):
+        num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
            Number of attention heads in each layer of the Transformer encoder.
        window_size (`int`, *optional*, defaults to 7):
            Size of windows.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
-        qkv_bias (`bool`, *optional*, defaults to True):
+        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether or not a learnable bias should be added to the queries, keys and values.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings and encoder.
@@ -64,11 +64,11 @@ class DonutSwinConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
            `"selu"` and `"gelu_new"` are supported.
-        use_absolute_embeddings (`bool`, *optional*, defaults to False):
+        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to add absolute position embeddings to the patch embeddings.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.

    Example:

--- a/src/transformers/models/donut/processing_donut.py
+++ b/src/transformers/models/donut/processing_donut.py
@@ -32,9 +32,9 @@ class DonutProcessor(ProcessorMixin):
    [`~DonutProcessor.decode`] for more information.

    Args:
-        image_processor ([`DonutImageProcessor`]):
+        image_processor ([`DonutImageProcessor`], *optional*):
            An instance of [`DonutImageProcessor`]. The image processor is a required input.
-        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]):
+        tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*):
            An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]

--- a/src/transformers/models/dpt/configuration_dpt.py
+++ b/src/transformers/models/dpt/configuration_dpt.py
@@ -52,9 +52,9 @@ class DPTConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -66,6 +66,8 @@ class DPTConfig(PretrainedConfig):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
+        is_hybrid (`bool`, *optional*, defaults to `False`):
+            Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        backbone_out_indices (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
@@ -79,11 +81,9 @@ class DPTConfig(PretrainedConfig):
            - "project" passes information to the other tokens by concatenating the readout to all other tokens before
              projecting the
            representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
-        is_hybrid (`bool`, *optional*, defaults to `False`):
-            Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
        reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
            The up/downsampling factors of the reassemble layers.
-        neck_hidden_sizes (`List[str]`, *optional*, defaults to [96, 192, 384, 768]):
+        neck_hidden_sizes (`List[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
            The hidden sizes to project to for the feature maps of the backbone.
        fusion_hidden_size (`int`, *optional*, defaults to 256):
            The number of channels before fusion.

--- a/src/transformers/models/dpt/image_processing_dpt.py
+++ b/src/transformers/models/dpt/image_processing_dpt.py
@@ -100,14 +100,14 @@ class DPTImageProcessor(BaseImageProcessor):
            Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the image after resizing. Can be overidden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
        keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
            If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
            be overidden by `keep_aspect_ratio` in `preprocess`.
        ensure_multiple_of (`int`, *optional*, defaults to 1):
            If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
            by `ensure_multiple_of` in `preprocess`.
-        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
-            Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in
            `preprocess`.

--- a/src/transformers/models/efficientnet/image_processing_efficientnet.py
+++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py
@@ -52,22 +52,22 @@ class EfficientNetImageProcessor(BaseImageProcessor):
            `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 346, "width": 346}`):
            Size of the image after `resize`. Can be overridden by `size` in `preprocess`.
-        resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.NEAREST`):
+        resample (`PILImageResampling` filter, *optional*, defaults to 0):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image
            is padded with 0's and then center cropped. Can be overridden by `do_center_crop` in `preprocess`.
        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 289, "width": 289}`):
            Desired output size when applying center-cropping. Can be overridden by `crop_size` in `preprocess`.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
-            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        rescale_offset (`bool`, *optional*, defaults to `False`):
            Whether to rescale the image between [-scale_range, scale_range] instead of [0, scale_range]. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.

--- a/src/transformers/models/falcon/configuration_falcon.py
+++ b/src/transformers/models/falcon/configuration_falcon.py
@@ -46,13 +46,13 @@ class FalconConfig(PretrainedConfig):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 71):
            Number of attention heads for each attention layer in the Transformer encoder.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon used by the layer normalization layers.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for MLP layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):

--- a/src/transformers/models/flaubert/tokenization_flaubert.py
+++ b/src/transformers/models/flaubert/tokenization_flaubert.py
@@ -207,7 +207,7 @@ class FlaubertTokenizer(PreTrainedTokenizer):
        mask_token (`str`, *optional*, defaults to `"<special1>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
-        additional_special_tokens (`List[str]`, *optional*, defaults to `["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
            List of additional special tokens.
        lang2id (`Dict[str, int]`, *optional*):
            Dictionary mapping languages string identifiers to their IDs.

--- a/src/transformers/models/flava/configuration_flava.py
+++ b/src/transformers/models/flava/configuration_flava.py
@@ -52,9 +52,9 @@ class FlavaImageConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -291,7 +291,7 @@ class FlavaMultimodalConfig(PretrainedConfig):
    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (`int`, *optional*, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
@@ -300,9 +300,9 @@ class FlavaMultimodalConfig(PretrainedConfig):
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

--- a/src/transformers/models/flava/processing_flava.py
+++ b/src/transformers/models/flava/processing_flava.py
@@ -33,8 +33,8 @@ class FlavaProcessor(ProcessorMixin):
    [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.

    Args:
-        image_processor ([`FlavaImageProcessor`]): The image processor is a required input.
-        tokenizer ([`BertTokenizerFast`]): The tokenizer is a required input.
+        image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input.
+        tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "FlavaImageProcessor"

--- a/src/transformers/models/focalnet/configuration_focalnet.py
+++ b/src/transformers/models/focalnet/configuration_focalnet.py
@@ -67,7 +67,7 @@ class FocalNetConfig(BackboneConfigMixin, PretrainedConfig):
            Stochastic depth rate.
        use_layerscale (`bool`, *optional*, defaults to `False`):
            Whether to use layer scale in the encoder.
-        layerscale_value (`float`, *optional*, defaults to 1e-4):
+        layerscale_value (`float`, *optional*, defaults to 0.0001):
            The initial value of the layer scale.
        use_post_layernorm (`bool`, *optional*, defaults to `False`):
            Whether to use post layer normalization in the encoder.
@@ -77,9 +77,9 @@ class FocalNetConfig(BackboneConfigMixin, PretrainedConfig):
            Whether to normalize the modulator.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
-        encoder_stride (`int`, `optional`, defaults to 32):
+        encoder_stride (`int`, *optional*, defaults to 32):
            Factor to increase the spatial resolution by in the decoder head for masked image modeling.
        out_features (`List[str]`, *optional*):
            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.

--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -146,13 +146,13 @@ class FSMTTokenizer(PreTrainedTokenizer):
    this superclass for more information regarding those methods.

    Args:
-        langs (`List[str]`):
+        langs (`List[str]`, *optional*):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
-        src_vocab_file (`str`):
+        src_vocab_file (`str`, *optional*):
            File containing the vocabulary for the source language.
-        tgt_vocab_file (`st`):
+        tgt_vocab_file (`st`, *optional*):
            File containing the vocabulary for the target language.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.