Unverified commit 929134bf authored by Adam Ross, committed by GitHub

[docstring] Fix docstring for speech-to-text config (#26883)

* Fix docstring for speech-to-text config

* Refactor doc line len <= 119 char

* Remove Speech2TextConfig from OBJECTS_TO_IGNORE

* Fix Speech2TextConfig doc str

* Fix Speech2TextConfig doc using doc-builder

* Refactor Speech2TextConfig doc
parent 08a2edfc
@@ -30,7 +30,7 @@ SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class Speech2TextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate an
+    This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate a
     Speech2Text model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the Speech2Text
     [facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
@@ -40,26 +40,36 @@ class Speech2TextConfig(PretrainedConfig):
     Args:
-        vocab_size (`int`, *optional*, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 10000):
             Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
             the `inputs_ids` passed when calling [`Speech2TextModel`]
-        d_model (`int`, *optional*, defaults to 1024):
-            Dimensionality of the layers and the pooler layer.
         encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 12):
-            Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 16):
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+        decoder_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
+            more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for
+            more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether the model should return the last key/values attentions (not used by all models).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is set up as an encoder-decoder architecture for sequence-to-sequence tasks.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimensionality of the layers and the pooler layer.
         dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
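The `encoder_layerdrop` / `decoder_layerdrop` options documented in the hunk above refer to the LayerDrop technique: whole layers are skipped at random during training. A minimal pure-Python sketch of the idea, using hypothetical layer callables rather than real Transformer layers:

```python
import random


def apply_layer_stack(hidden, layers, layerdrop=0.0, training=True):
    """Run `hidden` through `layers`, skipping each layer with probability
    `layerdrop` while training (the LayerDrop idea cited in the docstring).
    Toy sketch: `layers` is any list of callables, not real model code."""
    for layer in layers:
        if training and random.random() < layerdrop:
            continue  # this forward pass skips the whole layer
        hidden = layer(hidden)
    return hidden


# With the documented default of 0.0, no layer is ever skipped:
double = lambda x: x * 2
result = apply_layer_stack(1, [double, double, double], layerdrop=0.0)
# result == 8
```

At inference (`training=False`) every layer always runs, which is why the config defaults keep both probabilities at 0.0.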
@@ -68,18 +78,20 @@ class Speech2TextConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
+        decoder_start_token_id (`int`, *optional*, defaults to 2):
+            The initial token ID of the decoder when decoding sequences.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Whether the embeddings are scaled by the square root of `d_model`.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning-of-sequence token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the end-of-sequence token.
         max_source_positions (`int`, *optional*, defaults to 6000):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
         max_target_positions (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            The maximum sequence length that this model might ever be used with. Typically, set this to something large
             just in case (e.g., 512 or 1024 or 2048).
         num_conv_layers (`int`, *optional*, defaults to 2):
             Number of 1D convolutional layers in the conv module.
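The `scale_embedding` flag documented above means input embeddings are multiplied by the square root of `d_model` before entering the network. A toy, list-based sketch of that lookup (hypothetical embedding table, not the real model code), also using the special-token ids from the defaults:

```python
import math


def embed_tokens(token_ids, embedding_table, d_model=256, scale_embedding=True):
    """Look up each token's vector and, when `scale_embedding` is set,
    multiply it by sqrt(d_model) as described in the docstring.
    Toy sketch with a plain dict table; not the real Speech2Text code."""
    scale = math.sqrt(d_model) if scale_embedding else 1.0
    return [[v * scale for v in embedding_table[t]] for t in token_ids]


# bos=0, pad=1, eos=2 per the documented defaults; 1-dim vectors for brevity.
table = {0: [1.0], 1: [0.0], 2: [0.5]}
vecs = embed_tokens([0, 2], table, d_model=4)
# sqrt(4) = 2.0, so vecs == [[2.0], [1.0]]
```

The scaling keeps embedding magnitudes comparable to the positional encodings, which is why it defaults to `True` here.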
......
@@ -467,7 +467,6 @@ OBJECTS_TO_IGNORE = [
     "SpecialTokensMixin",
     "Speech2Text2Config",
     "Speech2Text2Tokenizer",
-    "Speech2TextConfig",
     "Speech2TextTokenizer",
     "SpeechEncoderDecoderModel",
     "SpeechT5Config",
......