Unverified commit c76afa51, authored by Manuel R. Ciosici and committed by GitHub

Fix LED documentation (#17181)

* Fix markdown code block

* Use consistent spelling for self-attention

* Fix typos and phrasing

* Fix code style
parent edcc66d2
@@ -86,18 +86,17 @@ class LEDConfig(PretrainedConfig):
     Example:
 
     ```python
-
-    ```
-
-    >>> from transformers import LEDModel, LEDConfig
-
-    >>> # Initializing a LED allenai/led-base-16384 style configuration >>> configuration = LEDConfig()
-
-    >>> # Initializing a model from the allenai/led-base-16384 style configuration >>> model =
-    LEDModel(configuration)
-
-    >>> # Accessing the model configuration >>> configuration = model.config
-    """
+    >>> from transformers import LEDModel, LEDConfig
+
+    >>> # Initializing a LED allenai/led-base-16384 style configuration
+    >>> configuration = LEDConfig()
+
+    >>> # Initializing a model from the allenai/led-base-16384 style configuration
+    >>> model = LEDModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "led"
     attribute_map = {
         "num_attention_heads": "encoder_attention_heads",
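For context, the corrected docstring example is runnable as ordinary Python. A minimal sketch (randomly initialized weights, so nothing is downloaded from the Hub):

```python
# Sketch of the fixed example: build a default (allenai/led-base-16384 style)
# configuration, instantiate a randomly initialized LEDModel from it, and read
# the configuration back off the model.
from transformers import LEDConfig, LEDModel

configuration = LEDConfig()      # default LED hyperparameters
model = LEDModel(configuration)  # random weights; no checkpoint download
configuration = model.config     # the model exposes its configuration
print(configuration.model_type)  # "led"
```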
@@ -1007,7 +1007,7 @@ class LEDDecoderLayer(nn.Module):
         """
         residual = hidden_states
 
-        # Self Attention
+        # Self-Attention
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
         # add present self-attn cache to positions 1,2 of present_key_value tuple
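The "positions 1,2" wording in the comment refers to the per-layer cache tuple used by BART-style decoders: the first two entries hold the decoder self-attention key/value states, the last two the cross-attention key/value states. A small illustrative sketch of that slicing (the tensor shapes below are made up for demonstration):

```python
# Illustrative layout of a per-layer cache tuple:
# (self_attn_key, self_attn_value, cross_attn_key, cross_attn_value)
import torch

batch, heads, seq_len, head_dim = 2, 4, 8, 16
past_key_value = tuple(torch.zeros(batch, heads, seq_len, head_dim) for _ in range(4))

# decoder uni-directional self-attention cache sits at indices 0-1
# (the "positions 1,2" in the comment's 1-based wording)
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# cross-attention cache sits at the last two indices
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None

assert len(self_attn_past_key_value) == 2 and len(cross_attn_past_key_value) == 2
```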
@@ -1437,13 +1437,11 @@ class LEDSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
 
 
 LED_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
+    This model inherits from [`PreTrainedModel`]. See the superclass documentation for the generic methods the library
+    implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
 
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for general usage and behavior.
 
     Parameters:
         config ([`LEDConfig`]):
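The docstring being tightened here points out that LED models are ordinary torch.nn.Module subclasses. A minimal sketch of using LEDModel that way, with deliberately tiny, illustrative LEDConfig values (not the pretrained allenai/led-base-16384 sizes):

```python
import torch
from transformers import LEDConfig, LEDModel

# Illustrative, reduced hyperparameters so the random model builds and runs quickly.
config = LEDConfig(
    d_model=64,
    encoder_layers=2,
    decoder_layers=2,
    encoder_attention_heads=2,
    decoder_attention_heads=2,
    encoder_ffn_dim=128,
    decoder_ffn_dim=128,
    attention_window=32,
    max_encoder_position_embeddings=512,
    max_decoder_position_embeddings=512,
)
model = LEDModel(config).eval()  # regular nn.Module: .eval(), .to(device), .parameters() all apply

input_ids = torch.randint(0, config.vocab_size, (1, 64))
decoder_input_ids = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
print(outputs.last_hidden_state.shape)  # (1, 16, 64): batch, decoder length, d_model
```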
@@ -1595,7 +1593,7 @@ LED_INPUTS_DOCSTRING = r"""
 
 class LEDEncoder(LEDPreTrainedModel):
     """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
     [`LEDEncoderLayer`].
 
     Args:
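As the rewritten docstring states, the encoder stack holds exactly `config.encoder_layers` [`LEDEncoderLayer`] modules, which can be checked directly on a randomly initialized model. A small sketch with illustrative, reduced sizes:

```python
from transformers import LEDConfig, LEDModel

# Illustrative, reduced sizes so the randomly initialized model builds quickly.
config = LEDConfig(
    encoder_layers=2, decoder_layers=2, d_model=64,
    encoder_ffn_dim=128, decoder_ffn_dim=128,
    encoder_attention_heads=2, decoder_attention_heads=2,
)
model = LEDModel(config)

# The encoder is a stack of config.encoder_layers LEDEncoderLayer modules.
assert len(model.encoder.layers) == config.encoder_layers
print(type(model.encoder.layers[0]).__name__)  # LEDEncoderLayer
```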
@@ -1643,7 +1641,7 @@ class LEDEncoder(LEDPreTrainedModel):
         self.post_init()
 
     def _merge_to_attention_mask(self, attention_mask: torch.Tensor, global_attention_mask: torch.Tensor):
-        # longformer self attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
+        # longformer self-attention expects attention mask to have 0 (no attn), 1 (local attn), 2 (global attn)
         # (global_attention_mask + 1) => 1 for local attention, 2 for global attention
         # => final attention_mask => 0 for no attention, 1 for local attention 2 for global attention
         if attention_mask is not None:
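The comment documents the mask convention used by LED's Longformer-style encoder self-attention: 0 means no attention (padding), 1 local attention, 2 global attention, and the merge is computed as `attention_mask * (global_attention_mask + 1)`. A standalone sketch of that arithmetic (the example tensors are made up, this is not the library code itself):

```python
import torch

# attention_mask: 1 for real tokens, 0 for padding
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0]])
# global_attention_mask: 1 where a token should attend globally (e.g. <s>), 0 elsewhere
global_attention_mask = torch.tensor([[1, 0, 0, 0, 0, 0]])

# (global_attention_mask + 1) maps 0 -> 1 (local) and 1 -> 2 (global);
# multiplying by attention_mask zeroes out the padding positions again.
merged = attention_mask * (global_attention_mask + 1)
print(merged)  # tensor([[2, 1, 1, 1, 0, 0]])
```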
@@ -1238,7 +1238,7 @@ class TFLEDDecoderLayer(tf.keras.layers.Layer):
         """
         residual = hidden_states
 
-        # Self Attention
+        # Self-Attention
         # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
         # add present self-attn cache to positions 1,2 of present_key_value tuple
@@ -1612,7 +1612,7 @@ LED_INPUTS_DOCSTRING = r"""
 class TFLEDEncoder(tf.keras.layers.Layer):
     config_class = LEDConfig
     """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    Transformer encoder consisting of *config.encoder_layers* self-attention layers. Each layer is a
     [`TFLEDEncoderLayer`].
 
     Args: