Unverified commit 87e6e4fe authored by Sylvain Gugger, committed by GitHub

Doc styler v2 (#14950)

* New doc styler

* Fix issue with args at the start

* Code sample fixes

* Style code examples in MDX

* Fix more patterns

* Typo

* Typo

* More patterns

* Do without black for now

* Get more info in error

* Docstring style

* Re-enable check

* Quality

* Fix add_end_docstring decorator

* Fix docstring
parent c1138273
@@ -118,8 +118,7 @@ class BeamSearchState:
class FlaxGenerationMixin:
    """
-   A class containing all of the functions supporting generation, to be used as a mixin in
-   [`FlaxPreTrainedModel`].
+   A class containing all of the functions supporting generation, to be used as a mixin in [`FlaxPreTrainedModel`].
    """
    @staticmethod
@@ -148,8 +147,7 @@ class FlaxGenerationMixin:
    def _adapt_logits_for_beam_search(self, logits):
        """
        This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
-       search behavior. Note that the only model that overwrites this method is
-       [`~transformers.FlaxMarianMTModel`].
+       search behavior. Note that the only model that overwrites this method is [`~transformers.FlaxMarianMTModel`].
        """
        return logits
@@ -181,11 +179,12 @@ class FlaxGenerationMixin:
        Generates sequences for models with a language modeling head. The method currently supports greedy decoding
        and multinomial sampling.
-       Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same
-       name inside the [`PretrainedConfig`] of the model. The default values indicated are the
-       default values of those config.
+       Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same name
+       inside the [`PretrainedConfig`] of the model. The default values indicated are the default values of those
+       config.
-       Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
+       Most of these parameters are explained in more detail in [this blog
+       post](https://huggingface.co/blog/how-to-generate).
        Parameters:
@@ -200,8 +199,8 @@ class FlaxGenerationMixin:
            top_k (`int`, *optional*, defaults to 50):
                The number of highest probability vocabulary tokens to keep for top-k-filtering.
            top_p (`float`, *optional*, defaults to 1.0):
-               If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
-               higher are kept for generation.
+               If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher
+               are kept for generation.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            bos_token_id (`int`, *optional*):
@@ -213,8 +212,8 @@ class FlaxGenerationMixin:
            decoder_start_token_id (`int`, *optional*):
                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
            trace (`bool`, *optional*, defaults to `True`):
-               Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to
-               a considerably slower runtime.
+               Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to a
+               considerably slower runtime.
            params (`Dict[str, jnp.ndarray]`, *optional*):
                Optionally the model parameters can be passed. Can be useful for parallelized generation.
            model_kwargs:
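To make the sampling options above concrete, here is a toy sketch of temperature and top-k multinomial sampling over a plain list of logits. This is an illustration of the documented parameters, not `FlaxGenerationMixin.generate`; the function name and behavior are assumptions for the example.

```python
import math
import random

def sample_token(logits, temperature=1.0, top_k=None, seed=0):
    """Sample one token id from `logits` (illustrative sketch)."""
    scaled = [l / temperature for l in logits]
    if top_k is not None:
        # Keep only the top_k highest logits; mask the rest out.
        cutoff = sorted(scaled, reverse=True)[top_k - 1]
        scaled = [s if s >= cutoff else -math.inf for s in scaled]
    exps = [math.exp(s) for s in scaled]  # math.exp(-inf) == 0.0
    total = sum(exps)
    probs = [e / total for e in exps]
    rng = random.Random(seed)
    return rng.choices(range(len(logits)), weights=probs, k=1)[0]
```

With `top_k=1` this degenerates to greedy decoding, which is why the docstring treats greedy decoding and multinomial sampling as the two supported modes.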
@@ -327,8 +326,8 @@ class FlaxGenerationMixin:
        self, top_k: int = None, top_p: float = None, temperature: float = None
    ) -> FlaxLogitsProcessorList:
        """
-       This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
-       [`FlaxLogitsWarper`] instances used for multinomial sampling.
+       This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsWarper`]
+       instances used for multinomial sampling.
        """
        # init warp parameters
@@ -359,8 +358,8 @@ class FlaxGenerationMixin:
        forced_eos_token_id: int,
    ) -> FlaxLogitsProcessorList:
        """
-       This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
-       [`FlaxLogitsProcessor`] instances used to modify the scores of the language model head.
+       This class returns a [`FlaxLogitsProcessorList`] list object that contains all relevant [`FlaxLogitsProcessor`]
+       instances used to modify the scores of the language model head.
        """
        processors = FlaxLogitsProcessorList()
...
@@ -33,9 +33,8 @@ LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.
-       Indices can be obtained using [`BertTokenizer`]. See
-       [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
-       details.
+       Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+       [`PreTrainedTokenizer.__call__`] for details.
        [What are input IDs?](../glossary#input-ids)
    scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
@@ -74,10 +73,9 @@ class LogitsWarper(ABC):
class LogitsProcessorList(list):
    """
-   This class can be used to create a list of [`LogitsProcessor`] or
-   [`LogitsWarper`] to subsequently process a `scores` input tensor. This class inherits from
-   list and adds a specific *__call__* method to apply each [`LogitsProcessor`] or
-   [`LogitsWarper`] to the inputs.
+   This class can be used to create a list of [`LogitsProcessor`] or [`LogitsWarper`] to subsequently process a
+   `scores` input tensor. This class inherits from list and adds a specific *__call__* method to apply each
+   [`LogitsProcessor`] or [`LogitsWarper`] to the inputs.
    """
    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
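The pattern described in that docstring is small enough to sketch in plain Python: a list subclass whose `__call__` threads the scores through each processor in order. The names below are illustrative, not the transformers API.

```python
class ProcessorList(list):
    """A list of callables applied to the scores in sequence."""

    def __call__(self, input_ids, scores):
        for processor in self:
            scores = processor(input_ids, scores)
        return scores

def halve(input_ids, scores):
    # Toy processor: scale every score down.
    return [s / 2 for s in scores]

def clip_negative(input_ids, scores):
    # Toy processor: floor scores at zero.
    return [max(s, 0.0) for s in scores]

processors = ProcessorList([halve, clip_negative])
print(processors([0, 1], [4.0, -2.0]))  # [2.0, 0.0]
```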
@@ -150,7 +148,8 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    Args:
        repetition_penalty (`float`):
-           The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+           The parameter for repetition penalty. 1.0 means no penalty. See [this
+           paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    """
    def __init__(self, penalty: float):
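The repetition-penalty rule from the cited CTRL paper can be sketched on plain lists: scores of already-generated tokens are divided by the penalty when positive and multiplied when negative, so seen tokens become less likely either way. This is an assumed simplification, not the torch implementation.

```python
def apply_repetition_penalty(scores, seen_token_ids, penalty):
    """Penalize the scores of tokens that already appeared (sketch)."""
    out = list(scores)
    for tok in seen_token_ids:
        s = out[tok]
        # Positive logits shrink, negative logits grow more negative.
        out[tok] = s / penalty if s > 0 else s * penalty
    return out

print(apply_repetition_penalty([2.0, -1.0, 0.5], [0, 1], 2.0))  # [1.0, -2.0, 0.5]
```

A `penalty` of 1.0 leaves every score unchanged, matching the "1.0 means no penalty" note above.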
@@ -171,13 +170,12 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
class TopPLogitsWarper(LogitsWarper):
    """
-   [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
-   prob_cut_off.
+   [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
    Args:
        top_p (`float`):
-           If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
-           kept for generation.
+           If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are kept
+           for generation.
        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
            All filtered values will be set to this float value.
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
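A minimal sketch of top-p (nucleus) filtering over a plain list of logits, under the reading given above: keep the smallest set of highest-probability tokens whose probabilities add up to at least `top_p`, and set everything else to `filter_value`. This is not the transformers code; it just illustrates the parameters.

```python
import math

def top_p_filter(logits, top_p, filter_value=-math.inf, min_tokens_to_keep=1):
    """Keep the nucleus of tokens summing to >= top_p; mask the rest (sketch)."""
    exps = [math.exp(l) for l in logits]
    total = sum(exps)
    probs = [e / total for e in exps]
    order = sorted(range(len(logits)), key=lambda i: probs[i], reverse=True)
    kept, cum = set(), 0.0
    for rank, i in enumerate(order):
        # Stop once the nucleus is big enough, but always keep min_tokens_to_keep.
        if cum >= top_p and rank >= min_tokens_to_keep:
            break
        kept.add(i)
        cum += probs[i]
    return [l if i in kept else filter_value for i, l in enumerate(logits)]
```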
@@ -278,7 +276,8 @@ def _calc_banned_ngram_tokens(
class NoRepeatNGramLogitsProcessor(LogitsProcessor):
    r"""
-   [`LogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
+   [`LogitsProcessor`] that enforces no repetition of n-grams. See
+   [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
    Args:
        ngram_size (`int`):
class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
r""" r"""
[`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids. [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids. See
See [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350). [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
Args: Args:
encoder_ngram_size (`int`): encoder_ngram_size (`int`):
@@ -471,16 +470,15 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
class PrefixConstrainedLogitsProcessor(LogitsProcessor):
    r"""
-   [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned
-   constrained generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more
-   information.
+   [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned constrained
+   generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information.
    Args:
        prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`):
            This function constrains the beam search to allowed tokens only at each step. This function takes 2
-           arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed
-           tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and
-           the batch ID `batch_id`.
+           arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
+           next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
+           `batch_id`.
    """
    def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
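A toy `prefix_allowed_tokens_fn` with the signature described above: given the batch id and the tokens generated so far, return the allowed next tokens. The template token ids here are hypothetical; the example just forces each batch entry to follow a fixed sequence.

```python
# Hypothetical per-batch token-id templates for the example.
TEMPLATES = {0: [5, 6, 7], 1: [8, 9]}

def prefix_allowed_tokens_fn(batch_id, input_ids):
    """Allow only the next token of this batch entry's template (sketch)."""
    step = len(input_ids)
    template = TEMPLATES[batch_id]
    if step < len(template):
        return [template[step]]
    return []  # nothing allowed once the template is exhausted
```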
@@ -498,20 +496,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
class HammingDiversityLogitsProcessor(LogitsProcessor):
    r"""
-   [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only
-   effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse
-   Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+   [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only effective for
+   [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence
+   Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
    Args:
        diversity_penalty (`float`):
            This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
            particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
        num_beams (`int`):
-           Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
-           more details.
+           Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more
+           details.
        num_beam_groups (`int`):
-           Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
-           beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+           Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+           See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
    """
    def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
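The penalty rule described above can be sketched on plain lists: for each candidate token, subtract `diversity_penalty` once for every earlier group that already picked that token at this step. An illustrative simplification, not the torch implementation.

```python
def apply_diversity_penalty(scores, previous_group_tokens, diversity_penalty):
    """Penalize tokens already chosen by other beam groups (sketch)."""
    counts = {}
    for tok in previous_group_tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return [s - diversity_penalty * counts.get(i, 0) for i, s in enumerate(scores)]

# Token 0 was picked twice by earlier groups, token 2 once.
print(apply_diversity_penalty([1.0, 1.0, 1.0], [0, 0, 2], 0.5))  # [0.0, 1.0, 0.5]
```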
@@ -579,8 +577,7 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
    r"""
-   [`LogitsProcessor`] that enforces the specified token as the last generated token when
-   `max_length` is reached.
+   [`LogitsProcessor`] that enforces the specified token as the last generated token when `max_length` is reached.
    Args:
        max_length (`int`):
@@ -604,9 +601,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
class InfNanRemoveLogitsProcessor(LogitsProcessor):
    r"""
-   [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation
-   method to fail. Note that using the logits processor should only be used if necessary since it can slow down the
-   generation method. `max_length` is reached.
+   [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation method to fail. Note that using
+   the logits processor should only be used if necessary since it can slow down the generation method.
    """
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
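A minimal sketch of the inf/nan sanitizing idea on plain floats: `nan` becomes 0.0 and infinities are clamped to a large finite value. The replacement constants are assumptions for illustration; the torch version clamps to dtype-specific extremes instead.

```python
import math

LARGE = 1e30  # illustrative stand-in for the dtype's maximum finite value

def remove_inf_nan(scores):
    """Replace nan with 0.0 and +/-inf with large finite values (sketch)."""
    out = []
    for s in scores:
        if math.isnan(s):
            out.append(0.0)
        elif math.isinf(s):
            out.append(LARGE if s > 0 else -LARGE)
        else:
            out.append(s)
    return out
```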
...
@@ -14,9 +14,8 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.
-       Indices can be obtained using [`BertTokenizer`]. See
-       [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
-       details.
+       Indices can be obtained using [`BertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+       [`PreTrainedTokenizer.__call__`] for details.
        [What are input IDs?](../glossary#input-ids)
    scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
@@ -41,8 +40,8 @@ class StoppingCriteria(ABC):
class MaxLengthCriteria(StoppingCriteria):
    """
-   This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`.
-   Keep in mind for decoder-only type of transformers, this will include the initial prompted tokens.
+   This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`. Keep
+   in mind for decoder-only type of transformers, this will include the initial prompted tokens.
    Args:
        max_length (`int`):
@@ -59,9 +58,9 @@ class MaxLengthCriteria(StoppingCriteria):
class MaxNewTokensCriteria(StoppingCriteria):
    """
-   This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`.
-   Keep in mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is
-   very close to `MaxLengthCriteria` but ignores the number of initial tokens.
+   This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`. Keep in
+   mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is very
+   close to `MaxLengthCriteria` but ignores the number of initial tokens.
    Args:
        start_length (`int`):
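The difference between the two stopping rules is easiest to see side by side: one counts all tokens including the prompt, the other counts only tokens generated after `start_length`. A minimal sketch on plain token lists, not the transformers classes.

```python
def max_length_reached(input_ids, max_length):
    """MaxLengthCriteria-style check: prompt tokens count (sketch)."""
    return len(input_ids) >= max_length

def max_new_tokens_reached(input_ids, start_length, max_new_tokens):
    """MaxNewTokensCriteria-style check: only new tokens count (sketch)."""
    return len(input_ids) - start_length >= max_new_tokens
```

With a 3-token prompt and `max_new_tokens=3`, a 5-token sequence has generated only 2 new tokens, so the second criterion does not fire even though a `max_length=5` criterion would.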
...
This diff is collapsed.
This diff is collapsed.
@@ -87,8 +87,8 @@ class ImageFeatureExtractionMixin:
    def to_pil_image(self, image, rescale=None):
        """
-       Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
-       axis if needed.
+       Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
+       needed.
        Args:
            image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
@@ -125,8 +125,7 @@ class ImageFeatureExtractionMixin:
                The image to convert to a NumPy array.
            rescale (`bool`, *optional*):
                Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
-               default to `True` if the image is a PIL Image or an array/tensor of integers, `False`
-               otherwise.
+               default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
            channel_first (`bool`, *optional*, defaults to `True`):
                Whether or not to permute the dimensions of the image to put the channel dimension first.
        """
@@ -151,8 +150,8 @@ class ImageFeatureExtractionMixin:
    def normalize(self, image, mean, std):
        """
-       Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of
-       `image` to a NumPy array if it's a PIL Image.
+       Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
+       if it's a PIL Image.
        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
@@ -210,8 +209,8 @@ class ImageFeatureExtractionMixin:
    def center_crop(self, image, size):
        """
-       Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
-       the size given, it will be padded (so the returned result has the size asked).
+       Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
+       size given, it will be padded (so the returned result has the size asked).
        Args:
            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
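The coordinate arithmetic behind a center crop is simple enough to sketch: offset each side by half the size difference. Negative offsets are exactly the "image too small" case the docstring mentions, where padding would be needed. Illustrative only, not the mixin's code.

```python
def center_crop_box(height, width, crop_h, crop_w):
    """Return (top, left, bottom, right) of a center crop (sketch).

    Negative coordinates mean the image is smaller than the crop and
    would need padding, as noted in the docstring above.
    """
    top = (height - crop_h) // 2
    left = (width - crop_w) // 2
    return top, left, top + crop_h, left + crop_w

print(center_crop_box(10, 10, 4, 4))  # (3, 3, 7, 7)
```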
...
@@ -479,15 +479,17 @@ class WandbCallback(TrainerCallback):
        """
        Setup the optional Weights & Biases (*wandb*) integration.
-       One can subclass and override this method to customize the setup if needed. Find more information [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment variables:
+       One can subclass and override this method to customize the setup if needed. Find more information
+       [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment
+       variables:
        Environment:
            WANDB_LOG_MODEL (`bool`, *optional*, defaults to `False`):
                Whether or not to log model as artifact at the end of training. Use along with
                *TrainingArguments.load_best_model_at_end* to upload best model.
            WANDB_WATCH (`str`, *optional* defaults to `"gradients"`):
-               Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient
-               logging or `"all"` to log gradients and parameters.
+               Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient logging or `"all"` to
+               log gradients and parameters.
            WANDB_PROJECT (`str`, *optional*, defaults to `"huggingface"`):
                Set this to a custom string to store results in a different project.
            WANDB_DISABLED (`bool`, *optional*, defaults to `False`):
@@ -608,7 +610,8 @@ class CometCallback(TrainerCallback):
                Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or
                "FALSE". Defaults to "TRUE".
-           For a number of configurable items in the environment, see [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
+           For a number of configurable items in the environment, see
+           [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
        """
        self._initialized = True
        log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
@@ -705,8 +708,8 @@ class MLflowCallback(TrainerCallback):
                Whether to use MLflow .log_artifact() facility to log artifacts.
                This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to *True* or *1*, will copy
-               whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote
-               artifact storage. Using it without a remote storage will just copy the files to your artifact location.
+               whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote artifact storage. Using it
+               without a remote storage will just copy the files to your artifact location.
        """
        log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
        if log_artifacts in {"TRUE", "1"}:
...
@@ -27,9 +27,8 @@ class KerasMetricCallback(Callback):
    `eval_dataset` before being passed to the `metric_fn` in `np.ndarray` format. The `metric_fn` should compute
    metrics and return a dict mapping metric names to metric values.
-   We provide an example of a suitable metric_fn that computes ROUGE scores for a summarization model below.
-   Note that this example skips some post-processing for readability and simplicity, and should probably
-   not be used as-is!
+   We provide an example of a suitable metric_fn that computes ROUGE scores for a summarization model below. Note that
+   this example skips some post-processing for readability and simplicity, and should probably not be used as-is!
    ```py
    from datasets import load_metric
...
@@ -134,8 +134,8 @@ class ModelCard:
                - a string, the *model id* of a pretrained model card hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
-               - a path to a *directory* containing a model card file saved using the
-                 [`~ModelCard.save_pretrained`] method, e.g.: `./my_model_directory/`.
+               - a path to a *directory* containing a model card file saved using the [`~ModelCard.save_pretrained`]
+                 method, e.g.: `./my_model_directory/`.
                - a path or url to a saved model card JSON *file*, e.g.: `./my_model_directory/modelcard.json`.
            cache_dir: (*optional*) string:
...
This diff is collapsed.
@@ -67,17 +67,17 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
    r"""
    Base class for all models.
-   [`FlaxPreTrainedModel`] takes care of storing the configuration of the models and handles
-   methods for loading, downloading and saving models.
+   [`FlaxPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
+   downloading and saving models.
    Class attributes (overridden by derived classes):
-       - **config_class** ([`PretrainedConfig`]) -- A subclass of
-         [`PretrainedConfig`] to use as configuration class for this model architecture.
+       - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
+         for this model architecture.
-       - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
-         derived classes of the same architecture adding modules on top of the base model.
+       - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
+         classes of the same architecture adding modules on top of the base model.
-       - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
-         NLP models, `pixel_values` for vision models and `input_values` for speech models).
+       - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
+         models, `pixel_values` for vision models and `input_values` for speech models).
    """
    config_class = None
    base_model_prefix = ""
@@ -183,8 +183,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
    def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
        r"""
-       Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not
-       cast the `params` in place.
+       Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast
+       the `params` in place.
        This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full
        half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
@@ -193,8 +193,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
            params (`Union[Dict, FrozenDict]`):
                A `PyTree` of model parameters.
            mask (`Union[Dict, FrozenDict]`):
-               A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
-               params you want to cast, and should be `False` for those you want to skip.
+               A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params
+               you want to cast, and should be `False` for those you want to skip.
        Examples:
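The masked-cast idea behind `mask` can be sketched without JAX: walk a nested dict of params together with a same-shaped boolean mask and convert only the leaves whose mask entry is `True`. Here `cast=float` stands in for the bfloat16 conversion; all names are illustrative, not the Flax/transformers implementation.

```python
def masked_cast(params, mask, cast=float):
    """Recursively cast only the leaves where mask is True (sketch)."""
    if isinstance(params, dict):
        return {k: masked_cast(params[k], mask[k], cast) for k in params}
    return cast(params) if mask else params

params = {"dense": {"kernel": 1, "bias": 2}}
mask = {"dense": {"kernel": True, "bias": False}}
print(masked_cast(params, mask))  # {'dense': {'kernel': 1.0, 'bias': 2}}
```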
@@ -218,15 +218,14 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
    def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
        r"""
-       Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
-       model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in
-       place.
+       Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
+       model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.
        Arguments:
            params (`Union[Dict, FrozenDict]`):
                A `PyTree` of model parameters.
            mask (`Union[Dict, FrozenDict]`):
-               A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
-               params you want to cast, and should be `False` for those you want to skip
+               A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params
+               you want to cast, and should be `False` for those you want to skip
        Examples:
...@@ -244,8 +243,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -244,8 +243,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None): def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
r""" r"""
Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
cast the `params` in place. `params` in place.
This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
half-precision training or to save weights in float16 for inference in order to save memory and improve speed. half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
...@@ -254,8 +253,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -254,8 +253,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
params (`Union[Dict, FrozenDict]`): params (`Union[Dict, FrozenDict]`):
A `PyTree` of model parameters. A `PyTree` of model parameters.
mask (`Union[Dict, FrozenDict]`): mask (`Union[Dict, FrozenDict]`):
A `PyTree` with the same structure as the `params` tree. The leaves should be booleans, `True` for A `PyTree` with the same structure as the `params` tree. The leaves should be booleans, `True` for params
params you want to cast, and should be `False` for those you want to skip you want to cast, and should be `False` for those you want to skip.
Examples: Examples:
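The three casting helpers above (`to_bf16`, `to_fp32`, `to_fp16`) all walk the `params` PyTree and cast leaves where the optional `mask` allows it. A minimal pure-Python sketch of that masking logic (no JAX dependency; `cast_tree` is a hypothetical name, and `round` stands in for a dtype cast):

```python
def cast_tree(params, cast_fn, mask=None):
    """Recursively apply cast_fn to the leaves of a nested-dict "PyTree".

    Sketch of the idea behind to_bf16/to_fp32/to_fp16: `mask` mirrors the
    structure of `params`, and a leaf is cast only where the corresponding
    mask leaf is True (or everywhere when no mask is given).
    """
    if isinstance(params, dict):
        return {
            key: cast_tree(value, cast_fn, None if mask is None else mask[key])
            for key, value in params.items()
        }
    return cast_fn(params) if (mask is None or mask) else params


params = {"encoder": {"kernel": 1.5, "bias": 0.25}, "lm_head": {"kernel": 2.0}}
# Cast everything except the lm_head kernel, e.g. to keep an output layer in full precision.
mask = {"encoder": {"kernel": True, "bias": True}, "lm_head": {"kernel": False}}
casted = cast_tree(params, round, mask)
```

The real methods do the same traversal with JAX tree utilities and `jnp` dtype casts rather than a Python recursion.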
...@@ -300,15 +299,15 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -300,15 +299,15 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
Can be either: Can be either:
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
a user or organization name, like `dbmdz/bert-base-german-cased`. user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing model weights saved using - A path to a *directory* containing model weights saved using
[`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this - A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this case,
case, `from_pt` should be set to `True`. `from_pt` should be set to `True`.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
GPUs) and `jax.numpy.bfloat16` (on TPUs). `jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified all the computation will be performed with the given `dtype`. specified all the computation will be performed with the given `dtype`.
...@@ -316,8 +315,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -316,8 +315,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
**Note that this only specifies the dtype of the computation and does not influence the dtype of model **Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.** parameters.**
If you wish to change the dtype of the model parameters, see If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`]. [`~FlaxPreTrainedModel.to_bf16`].
model_args (sequence of positional arguments, *optional*): model_args (sequence of positional arguments, *optional*):
All remaining positional arguments will be passed to the underlying model's `__init__` method. All remaining positional arguments will be passed to the underlying model's `__init__` method.
...@@ -332,8 +330,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -332,8 +330,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
- The model is a model provided by the library (loaded with the *model id* string of a pretrained - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model). model).
- The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
by supplying the save directory. save directory.
- The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
configuration JSON file named *config.json* is found in the directory. configuration JSON file named *config.json* is found in the directory.
cache_dir (`Union[str, os.PathLike]`, *optional*): cache_dir (`Union[str, os.PathLike]`, *optional*):
...@@ -353,7 +351,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -353,7 +351,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists. file exists.
proxies (`Dict[str, str]`, *optional*): proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
local_files_only(`bool`, *optional*, defaults to `False`): local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model). Whether or not to only look at local files (i.e., do not try to download the model).
revision(`str`, *optional*, defaults to `"main"`): revision(`str`, *optional*, defaults to `"main"`):
...@@ -369,10 +368,10 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -369,10 +368,10 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
underlying model's `__init__` method (we assume all relevant updates to the configuration have underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done) already been done)
- If a configuration is not provided, `kwargs` will be first passed to the configuration class - If a configuration is not provided, `kwargs` will be first passed to the configuration class
initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
`kwargs` that corresponds to a configuration attribute will be used to override said attribute corresponds to a configuration attribute will be used to override said attribute with the
with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
attribute will be passed to the underlying model's `__init__` function. will be passed to the underlying model's `__init__` function.
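The kwargs routing described above can be sketched with a small hypothetical helper (`route_kwargs` is not part of the library; it only illustrates the split between configuration overrides and model `__init__` arguments):

```python
def route_kwargs(config_attributes, kwargs):
    """Keys that name a configuration attribute override the config; every
    remaining key is forwarded to the underlying model's __init__."""
    config_kwargs = {k: v for k, v in kwargs.items() if k in config_attributes}
    model_kwargs = {k: v for k, v in kwargs.items() if k not in config_attributes}
    return config_kwargs, model_kwargs


config_overrides, model_kwargs = route_kwargs(
    {"hidden_size", "num_hidden_layers"}, {"hidden_size": 128, "from_pt": True}
)
```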
Examples: Examples:
...@@ -605,16 +604,14 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): ...@@ -605,16 +604,14 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
<Tip warning={true}> <Tip warning={true}>
Using `push_to_hub=True` will synchronize the repository you are pushing to with Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
`save_directory`, which requires `save_directory` to be a local clone of the repo you are which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory folder. Pass along `temp_dir=True` to use a temporary directory instead.
instead.
</Tip> </Tip>
kwargs: kwargs:
Additional keyword arguments passed along to the Additional keyword arguments passed along to the [`~file_utils.PushToHubMixin.push_to_hub`] method.
[`~file_utils.PushToHubMixin.push_to_hub`] method.
""" """
if os.path.isfile(save_directory): if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file") logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
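The guard shown above can be exercised standalone; this is a hypothetical variant that raises instead of only logging, and creates the directory when it is missing:

```python
import os
import tempfile


def ensure_save_directory(save_directory):
    """Sketch of the save_pretrained precondition: the target must be a
    directory, not a file. The real method logs an error and returns."""
    if os.path.isfile(save_directory):
        raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
    os.makedirs(save_directory, exist_ok=True)
    return save_directory


target = ensure_save_directory(os.path.join(tempfile.mkdtemp(), "saved_model"))
```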
...@@ -102,15 +102,15 @@ def keras_serializable(cls): ...@@ -102,15 +102,15 @@ def keras_serializable(cls):
1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at 1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at
serialization time). serialization time).
2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization 2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and
time) and convert it to a config object for the actual layer initializer. convert it to a config object for the actual layer initializer.
3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not 3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`. need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.
Args: Args:
cls (a `tf.keras.layers.Layer` subclass): cls (a `tf.keras.layers.Layer` subclass):
Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its
its initializer. initializer.
Returns: Returns:
The same class object, with modifications for Keras deserialization. The same class object, with modifications for Keras deserialization.
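The `__init__` wrapping in step 2 can be sketched as follows (a pure-Python stand-in: `keras_serializable_sketch` is a hypothetical name and `SimpleNamespace` replaces the real config class):

```python
from types import SimpleNamespace


def keras_serializable_sketch(cls):
    """Wrap __init__ so a plain config dict (as Keras would pass at
    deserialization time) is converted back into a config object before
    the layer initializer runs."""
    original_init = cls.__init__

    def wrapped_init(self, *args, **kwargs):
        config = kwargs.pop("transformers_config", None)
        if isinstance(config, dict):
            config = SimpleNamespace(**config)  # stand-in for the real config class
        original_init(self, *args, config=config, **kwargs)

    cls.__init__ = wrapped_init
    return cls


@keras_serializable_sketch
class MainLayer:
    def __init__(self, config=None):
        self.config = config


layer = MainLayer(transformers_config={"hidden_size": 64})
```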
...@@ -648,20 +648,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -648,20 +648,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
r""" r"""
Base class for all TF models. Base class for all TF models.
[`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
for loading, downloading and saving models as well as a few methods common to all models to: downloading and saving models as well as a few methods common to all models to:
- resize the input embeddings, - resize the input embeddings,
- prune heads in the self-attention heads. - prune heads in the self-attention heads.
Class attributes (overridden by derived classes): Class attributes (overridden by derived classes):
- **config_class** ([`PretrainedConfig`]) -- A subclass of - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
[`PretrainedConfig`] to use as configuration class for this model architecture. for this model architecture.
- **base_model_prefix** (`str`) -- A string indicating the attribute associated with the base model in - **base_model_prefix** (`str`) -- A string indicating the attribute associated with the base model in derived
derived classes of the same architecture adding modules on top of the base model. classes of the same architecture adding modules on top of the base model.
- **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
NLP models, `pixel_values` for vision models and `input_values` for speech models). models, `pixel_values` for vision models and `input_values` for speech models).
""" """
config_class = None config_class = None
base_model_prefix = "" base_model_prefix = ""
...@@ -1104,9 +1104,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1104,9 +1104,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Arguments: Arguments:
new_num_tokens (`int`, *optional*): new_num_tokens (`int`, *optional*):
The number of new tokens in the embedding matrix. Increasing the size will add newly initialized The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
just returns a pointer to the input tokens `tf.Variable` module of the model without doing returns a pointer to the input tokens `tf.Variable` module of the model without doing anything.
anything.
Return: Return:
`tf.Variable`: Pointer to the input tokens Embeddings Module of the model. `tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
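The resizing semantics described above can be sketched without TensorFlow (a hypothetical pure-Python stand-in operating on a list-of-rows embedding matrix rather than a `tf.Variable`):

```python
def resize_embeddings(old_embeddings, new_num_tokens=None, init_value=0.0):
    """Grow by appending newly initialized rows, shrink by dropping rows
    from the end, and return the input unchanged when new_num_tokens is
    None -- mirroring the behavior documented above."""
    if new_num_tokens is None:
        return old_embeddings
    if new_num_tokens <= len(old_embeddings):
        return old_embeddings[:new_num_tokens]
    hidden_size = len(old_embeddings[0])
    extra = [[init_value] * hidden_size for _ in range(new_num_tokens - len(old_embeddings))]
    return old_embeddings + extra
```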
...@@ -1234,8 +1233,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1234,8 +1233,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
vectors from the end. If not provided or `None`, just returns None vectors from the end. If not provided or `None`, just returns None
Return: Return:
`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the `tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the input
input ones. ones.
""" """
new_lm_head_decoder = old_lm_head_decoder new_lm_head_decoder = old_lm_head_decoder
is_input_output_equals = tf.reduce_any( is_input_output_equals = tf.reduce_any(
...@@ -1273,8 +1272,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1273,8 +1272,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
`tf.Variable` module of the model without doing anything. `tf.Variable` module of the model without doing anything.
Return: Return:
`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if `tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if `new_num_tokens` is
`new_num_tokens` is `None` `None`
""" """
old_embedding_dim = shape_list(old_embeddings)[1] old_embedding_dim = shape_list(old_embeddings)[1]
init_range = getattr(self.config, "initializer_range", 0.02) init_range = getattr(self.config, "initializer_range", 0.02)
...@@ -1297,9 +1296,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1297,9 +1296,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Arguments: Arguments:
heads_to_prune (`Dict[int, List[int]]`): heads_to_prune (`Dict[int, List[int]]`):
Dictionary with keys being selected layer indices (`int`) and associated values being the list of Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
0 and 2 on layer 1 and heads 2 and 3 on layer 2. layer 1 and heads 2 and 3 on layer 2.
""" """
raise NotImplementedError raise NotImplementedError
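The `heads_to_prune` format documented above can be illustrated with a small hypothetical helper (`heads_remaining` is not part of the library):

```python
# Layer index -> list of head indices to prune, as in the docstring example.
heads_to_prune = {1: [0, 2], 2: [2, 3]}


def heads_remaining(num_layers, num_heads, heads_to_prune):
    """Number of attention heads left per layer after pruning."""
    return {
        layer: num_heads - len(heads_to_prune.get(layer, []))
        for layer in range(num_layers)
    }
```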
...@@ -1322,16 +1321,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1322,16 +1321,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
<Tip warning={true}> <Tip warning={true}>
Using `push_to_hub=True` will synchronize the repository you are pushing to with Using `push_to_hub=True` will synchronize the repository you are pushing to with `save_directory`,
`save_directory`, which requires `save_directory` to be a local clone of the repo you are which requires `save_directory` to be a local clone of the repo you are pushing to if it's an existing
pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory folder. Pass along `temp_dir=True` to use a temporary directory instead.
instead.
</Tip> </Tip>
kwargs: kwargs:
Additional keyword arguments passed along to the Additional keyword arguments passed along to the [`~file_utils.PushToHubMixin.push_to_hub`] method.
[`~file_utils.PushToHubMixin.push_to_hub`] method.
""" """
if os.path.isfile(save_directory): if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file") logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
...@@ -1378,15 +1375,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1378,15 +1375,14 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
Can be either: Can be either:
- A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
a user or organization name, like `dbmdz/bert-base-german-cased`. user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing model weights saved using - A path to a *directory* containing model weights saved using
[`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
- A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this
this case, `from_pt` should be set to `True` and a configuration object should be provided case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
as `config` argument. This loading path is slower than converting the PyTorch model in a argument. This loading path is slower than converting the PyTorch model in a TensorFlow model
TensorFlow model using the provided conversion scripts and loading the TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
afterwards.
- `None` if you are both providing the configuration and state dictionary (resp. with keyword - `None` if you are both providing the configuration and state dictionary (resp. with keyword
arguments `config` and `state_dict`). arguments `config` and `state_dict`).
model_args (sequence of positional arguments, *optional*): model_args (sequence of positional arguments, *optional*):
...@@ -1402,8 +1398,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1402,8 +1398,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
- The model is a model provided by the library (loaded with the *model id* string of a pretrained - The model is a model provided by the library (loaded with the *model id* string of a pretrained
model). model).
- The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded by supplying the
by supplying the save directory. save directory.
- The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
configuration JSON file named *config.json* is found in the directory. configuration JSON file named *config.json* is found in the directory.
from_pt: (`bool`, *optional*, defaults to `False`): from_pt: (`bool`, *optional*, defaults to `False`):
...@@ -1422,14 +1418,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1422,14 +1418,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
resume_download (`bool`, *optional*, defaults to `False`): resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Will attempt to resume the download if such a Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists. file exists.
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g.,
`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a
dictionary containing missing keys, unexpected keys and error messages.
local_files_only(`bool`, *optional*, defaults to `False`): local_files_only(`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (e.g., not try downloading the model). Whether or not to only look at local files (e.g., not try downloading the model).
use_auth_token (`str` or *bool*, *optional*): use_auth_token (`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
generated when running `transformers-cli login` (stored in `~/.huggingface`). when running `transformers-cli login` (stored in `~/.huggingface`).
revision(`str`, *optional*, defaults to `"main"`): revision(`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
...@@ -1447,10 +1445,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu ...@@ -1447,10 +1445,10 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
underlying model's `__init__` method (we assume all relevant updates to the configuration have underlying model's `__init__` method (we assume all relevant updates to the configuration have
already been done) already been done)
- If a configuration is not provided, `kwargs` will be first passed to the configuration class - If a configuration is not provided, `kwargs` will be first passed to the configuration class
initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
`kwargs` that corresponds to a configuration attribute will be used to override said attribute corresponds to a configuration attribute will be used to override said attribute with the
with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
attribute will be passed to the underlying model's `__init__` function. will be passed to the underlying model's `__init__` function.
<Tip> <Tip>
...@@ -1782,19 +1780,20 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): ...@@ -1782,19 +1780,20 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`. In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
mode (`str`, defaults to `"embedding"`): mode (`str`, defaults to `"embedding"`):
A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer should be
should be used as an embedding layer, the second one that the layer should be used as a linear decoder. used as an embedding layer, the second one that the layer should be used as a linear decoder.
Returns: Returns:
`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape `[batch_size, length,
`[batch_size, length, embedding_size]`. embedding_size]`.
In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`. In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`.
Raises: Raises:
ValueError: if `mode` is not valid. ValueError: if `mode` is not valid.
Shared weights logic is adapted from [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24). Shared weights logic is adapted from
[here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24).
""" """
if mode == "embedding": if mode == "embedding":
return self._embedding(inputs) return self._embedding(inputs)
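The two modes described above can be sketched in pure Python (a hypothetical stand-in; the real layer performs the lookup and the matrix product with TensorFlow ops on the shared weight):

```python
def shared_embedding_call(weights, inputs, mode="embedding"):
    """"embedding" looks up rows of the shared weight matrix for a list of
    token ids; "linear" multiplies hidden states by its transpose, i.e.
    tied input/output embeddings."""
    if mode == "embedding":
        return [weights[token_id] for token_id in inputs]  # [length, hidden_size]
    if mode == "linear":
        return [
            [sum(h * w for h, w in zip(hidden, row)) for row in weights]
            for hidden in inputs
        ]  # [length, vocab_size]
    raise ValueError(f"mode {mode} is not valid.")
```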
...@@ -1842,14 +1841,12 @@ class TFSequenceSummary(tf.keras.layers.Layer): ...@@ -1842,14 +1841,12 @@ class TFSequenceSummary(tf.keras.layers.Layer):
- `"attn"` -- Not implemented now, use multi-head attention - `"attn"` -- Not implemented now, use multi-head attention
- **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
- **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
`config.num_labels` classes (otherwise to `config.hidden_size`). (otherwise to `config.hidden_size`).
- **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
output, another string or `None` will add no activation. another string or `None` will add no activation.
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
activation. - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and
activation.
initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights. initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs: kwargs:
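The summary options can be sketched in pure Python. This assumes the usual `"last"`/`"first"`/`"mean"` values that precede the `"attn"` bullet shown above (they are not all visible in this excerpt), and `summarize` is a hypothetical name:

```python
def summarize(hidden_states, summary_type="last"):
    """Pick the last token, the first ([CLS]-style) token, or the mean over
    tokens; "attn" is not implemented, matching the note above."""
    if summary_type == "last":
        return hidden_states[-1]
    if summary_type == "first":
        return hidden_states[0]
    if summary_type == "mean":
        return [sum(column) / len(hidden_states) for column in zip(*hidden_states)]
    raise NotImplementedError(summary_type)
```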
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" ALBERT model configuration """ """ ALBERT model configuration"""
from collections import OrderedDict from collections import OrderedDict
from typing import Mapping from typing import Mapping
...@@ -35,19 +35,18 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { ...@@ -35,19 +35,18 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class AlbertConfig(PretrainedConfig): class AlbertConfig(PretrainedConfig):
r""" r"""
This is the configuration class to store the configuration of a [`AlbertModel`] or a This is the configuration class to store the configuration of a [`AlbertModel`] or a [`TFAlbertModel`]. It is used
[`TFAlbertModel`]. It is used to instantiate an ALBERT model according to the specified to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating
arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
configuration to that of the ALBERT [xxlarge](https://huggingface.co/albert-xxlarge-v2) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    vocab_size (`int`, *optional*, defaults to 30000):
        Vocabulary size of the ALBERT model. Defines the number of different tokens that can be represented by
        the `inputs_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`].
    embedding_size (`int`, *optional*, defaults to 128):
        Dimensionality of vocabulary embeddings.
    hidden_size (`int`, *optional*, defaults to 4096):
@@ -63,8 +62,8 @@ class AlbertConfig(PretrainedConfig):
    inner_group_num (`int`, *optional*, defaults to 1):
        The number of inner repetitions of attention and ffn.
    hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu_new"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    hidden_dropout_prob (`float`, *optional*, defaults to 0):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
@@ -73,8 +72,7 @@ class AlbertConfig(PretrainedConfig):
        The maximum sequence length that this model might ever be used with. Typically set this to something
        large (e.g., 512 or 1024 or 2048).
    type_vocab_size (`int`, *optional*, defaults to 2):
        The vocabulary size of the `token_type_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`].
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-12):
@@ -82,10 +80,11 @@ class AlbertConfig(PretrainedConfig):
    classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
        The dropout ratio for attached classifiers.
    position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
        Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
        positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
        [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
        For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
        with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
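The argument list above can be exercised with a small stand-in. The following is an illustrative sketch in plain Python — a dataclass mirroring the documented defaults, not the actual `AlbertConfig` class — showing how the defaults combine and how a single argument is overridden:

```python
from dataclasses import dataclass


@dataclass
class ToyAlbertConfig:
    """Toy stand-in collecting the documented ALBERT defaults."""

    vocab_size: int = 30000
    embedding_size: int = 128
    hidden_size: int = 4096
    inner_group_num: int = 1
    hidden_act: str = "gelu_new"
    hidden_dropout_prob: float = 0.0
    attention_probs_dropout_prob: float = 0.0
    type_vocab_size: int = 2
    initializer_range: float = 0.02
    layer_norm_eps: float = 1e-12
    classifier_dropout_prob: float = 0.1
    position_embedding_type: str = "absolute"


config = ToyAlbertConfig()                 # all documented defaults
small = ToyAlbertConfig(hidden_size=768)   # override a single argument
```

In the real library the same pattern is `AlbertConfig(hidden_size=768)`, and the resulting object is what gets passed to the model constructors.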
Examples:
...
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ALBERT model."""
import math
import os
@@ -529,12 +529,13 @@ class AlbertForPreTrainingOutput(ModelOutput):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
@@ -549,19 +550,18 @@ class AlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.
    Args:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
@@ -569,9 +569,8 @@ ALBERT_INPUTS_DOCSTRING = r"""
    input_ids (`torch.LongTensor` of shape `({0})`):
        Indices of input sequence tokens in the vocabulary.
        Indices can be obtained using [`AlbertTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
        [`PreTrainedTokenizer.encode`] for details.
        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
@@ -582,14 +581,16 @@ ALBERT_INPUTS_DOCSTRING = r"""
        [What are attention masks?](../glossary#attention-mask)
    token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
        - 0 corresponds to a *sentence A* token,
        - 1 corresponds to a *sentence B* token.
        [What are token type IDs?](../glossary#token-type-ids)
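As a concrete illustration of the convention above, `token_type_ids` for a sentence pair can be built by hand (toy tokens only — in practice the tokenizer produces these for you):

```python
# 0 marks sentence-A tokens (including [CLS] and the first [SEP]),
# 1 marks sentence-B tokens.
sentence_a = ["[CLS]", "how", "are", "you", "[SEP]"]
sentence_b = ["fine", "thanks", "[SEP]"]

tokens = sentence_a + sentence_b
token_type_ids = [0] * len(sentence_a) + [1] * len(sentence_b)
```

The two lists stay aligned: each token gets exactly one segment id, and a single-sentence input would simply be all zeros.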
    position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
        config.max_position_embeddings - 1]`.
        [What are position IDs?](../glossary#position-ids)
    head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
@@ -599,9 +600,9 @@ ALBERT_INPUTS_DOCSTRING = r"""
        - 0 indicates the head is **masked**.
    inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
        model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
        tensors for more detail.
@@ -785,12 +786,13 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`: `0` indicates original order (sequence A,
            then sequence B), `1` indicates switched order (sequence B, then sequence A).
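The `-100` convention for `labels` can be sketched with a toy loss in plain Python. This is illustrative only — the real models get this behavior from `torch.nn.CrossEntropyLoss`, whose default `ignore_index` is `-100`:

```python
import math


def toy_masked_lm_loss(token_log_probs, labels):
    """Mean negative log-likelihood over positions whose label is not -100."""
    kept = [-lp[y] for lp, y in zip(token_log_probs, labels) if y != -100]
    return sum(kept) / len(kept)


# Log-probability the model assigned to the gold token id at each position.
token_log_probs = [
    {7: math.log(0.9)},   # masked position, gold id 7
    {3: math.log(0.5)},   # masked position, gold id 3
    {0: math.log(0.1)},   # unmasked position -> its label is -100
]
labels = [7, 3, -100]

loss = toy_masked_lm_loss(token_log_probs, labels)  # only the first two count
```

The third position contributes nothing to the loss, which is exactly what "ignored (masked)" means above.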
        Returns:
@@ -935,8 +937,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1014,8 +1017,9 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss);
            if `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1212,12 +1216,12 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
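The clamping described above can be sketched in plain Python (illustrative only; in the real models the clamped-to boundary value then acts as the loss's ignore index, so out-of-range annotations contribute nothing):

```python
def clamp_position(position, sequence_length):
    """Clamp an annotated span boundary into the range [0, sequence_length]."""
    return min(max(position, 0), sequence_length)


sequence_length = 384
inside = clamp_position(12, sequence_length)    # valid position, kept as-is
outside = clamp_position(500, sequence_length)  # beyond the sequence, clamped
```

A start or end annotation that points past the end of the (possibly truncated) sequence is therefore never back-propagated through.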
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1310,7 +1314,8 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors (see
            *input_ids* above).
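The expected layout can be illustrated with toy data (plain Python lists standing in for tensors of shape `(batch_size, num_choices, sequence_length)`):

```python
# One example with three candidate choices, each a toy sequence of 4 token ids.
input_ids = [
    [[101, 7, 8, 102], [101, 7, 9, 102], [101, 7, 10, 102]],
]
labels = [2]  # index of the correct choice, must lie in [0, num_choices - 1]

batch_size = len(input_ids)
num_choices = len(input_ids[0])
valid = all(0 <= label < num_choices for label in labels)
```

So the label indexes a *choice*, not a token: the classifier scores each of the `num_choices` sequences and the loss is a cross-entropy over those scores.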
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
@@ -65,12 +65,13 @@ class FlaxAlbertForPreTrainingOutput(ModelOutput):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
@@ -84,12 +85,12 @@ class FlaxAlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods
    the library implements for all its models (such as downloading, saving and converting weights from PyTorch
    models)
    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matters related to
    general usage and behavior.
    Finally, this model supports inherent JAX features such as:
@@ -101,11 +102,10 @@ ALBERT_START_DOCSTRING = r"""
    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).
            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.
@@ -113,8 +113,8 @@ ALBERT_START_DOCSTRING = r"""
            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**
            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""
ALBERT_INPUTS_DOCSTRING = r"""
@@ -122,9 +122,8 @@ ALBERT_INPUTS_DOCSTRING = r"""
    input_ids (`numpy.ndarray` of shape `({0})`):
        Indices of input sequence tokens in the vocabulary.
        Indices can be obtained using [`AlbertTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.
        [What are input IDs?](../glossary#input-ids)
    attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
@@ -135,14 +134,16 @@ ALBERT_INPUTS_DOCSTRING = r"""
        [What are attention masks?](../glossary#attention-mask)
    token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
        Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
        - 0 corresponds to a *sentence A* token,
        - 1 corresponds to a *sentence B* token.
        [What are token type IDs?](../glossary#token-type-ids)
    position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
        config.max_position_embeddings - 1]`.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
@@ -817,7 +818,7 @@ class FlaxAlbertForMaskedLMModule(nn.Module):
    )
@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
    module_class = FlaxAlbertForMaskedLMModule
...
@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 ALBERT model."""
import math
from dataclasses import dataclass
@@ -657,12 +657,13 @@ class TFAlbertForPreTrainingOutput(ModelOutput):
        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
        before SoftMax).
    hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
        `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
@@ -677,13 +678,13 @@ class TFAlbertForPreTrainingOutput(ModelOutput):
ALBERT_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads etc.)
    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use
    it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general
    usage and behavior.
    <Tip>
@@ -692,11 +693,11 @@ ALBERT_START_DOCSTRING = r"""
    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.
    This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all
    the tensors in the first argument of the model call function: `model(inputs)`.
    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
    the first positional argument:
    - a single Tensor with `input_ids` only and nothing else: `model(inputs_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
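A plain-Python sketch of how dispatching over these input layouts can work (illustrative only, not the actual Keras input-unpacking code; the remaining possibilities are collapsed out of this diff, though the real models also accept a dict keyed by argument name):

```python
class ToyTensor:
    """Stand-in for a framework tensor."""

    def __init__(self, data):
        self.data = data


def toy_call(inputs):
    # Dispatch on the layout of the first positional argument: either a
    # single tensor (input_ids only) or a list/tuple of tensors in the
    # documented order (input_ids, attention_mask, ...).
    if isinstance(inputs, (list, tuple)):
        input_ids = inputs[0]
        attention_mask = inputs[1] if len(inputs) > 1 else None
    else:
        input_ids, attention_mask = inputs, None
    return input_ids, attention_mask


ids = ToyTensor([101, 2023, 102])
mask = ToyTensor([1, 1, 1])

single = toy_call(ids)          # single tensor: only input_ids
listed = toy_call([ids, mask])  # list in the documented order
```

Either way the model body sees the same named arguments; only the packing of the first positional argument differs.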
@@ -709,8 +710,7 @@ ALBERT_START_DOCSTRING = r"""
    Args:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r""" ALBERT_INPUTS_DOCSTRING = r"""
@@ -718,9 +718,8 @@ ALBERT_INPUTS_DOCSTRING = r"""
         input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
             Indices of input sequence tokens in the vocabulary.

-            Indices can be obtained using [`AlbertTokenizer`]. See
-            [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
-            details.
+            Indices can be obtained using [`AlbertTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.

             [What are input IDs?](../glossary#input-ids)
         attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
@@ -731,14 +730,16 @@ ALBERT_INPUTS_DOCSTRING = r"""
             [What are attention masks?](../glossary#attention-mask)
         token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
-            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:

             - 0 corresponds to a *sentence A* token,
             - 1 corresponds to a *sentence B* token.

             [What are token type IDs?](../glossary#token-type-ids)
         position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.

             [What are position IDs?](../glossary#position-ids)
         head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
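The `token_type_ids` and `position_ids` conventions above can be illustrated with a small pure-Python helper. This is a hypothetical sketch, not library code: sentence A tokens get segment 0, sentence B tokens get 1, and positions count up from 0 while staying in `[0, config.max_position_embeddings - 1]`.

```python
# Hypothetical sketch of the two id conventions: segment ids are 0 for the
# first sequence and 1 for the second; position ids are simply 0..n-1 and must
# not exceed max_position_embeddings - 1.
def segment_and_position_ids(len_a, len_b, max_position_embeddings=512):
    token_type_ids = [0] * len_a + [1] * len_b
    position_ids = list(range(len_a + len_b))
    if position_ids and position_ids[-1] > max_position_embeddings - 1:
        raise ValueError("sequence longer than max_position_embeddings")
    return token_type_ids, position_ids
```

For a 3-token sentence A and a 2-token sentence B this yields `([0, 0, 0, 1, 1], [0, 1, 2, 3, 4])`.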
@@ -748,9 +749,9 @@ ALBERT_INPUTS_DOCSTRING = r"""
             - 0 indicates the head is **masked**.
         inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated
-            vectors than the model's internal embedding lookup matrix.
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
@@ -760,8 +761,8 @@ ALBERT_INPUTS_DOCSTRING = r"""
             more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
             used instead.
         return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
-            argument can be used in eager mode, in graph mode the value will always be set to True.
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode, in graph mode the value will always be set to True.
         training (`bool`, *optional*, defaults to `False`):
             Whether or not to use the model in training mode (some modules like dropout modules have different
             behaviors between training and evaluation).
@@ -982,7 +983,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
         return logits


-@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
+@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
     # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
     _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions.decoder.weight"]
@@ -1020,8 +1021,9 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss)
     ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
         r"""
         labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
         """
         inputs = input_processing(
             func=self.call,
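The `-100` label convention in the docstring above can be shown with a tiny pure-Python sketch (hypothetical helper, not part of the library): positions labelled `-100` are masked out, and only labels in `[0, ..., config.vocab_size]` contribute to the loss.

```python
# Sketch of the -100 masking convention for masked language modeling labels:
# a label of -100 means "ignore this token when computing the loss".
def active_label_positions(labels):
    return [i for i, label in enumerate(labels) if label != -100]
```

With `labels = [-100, 42, -100, 7]`, only positions 1 and 3 enter the loss.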
@@ -1124,8 +1126,9 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
     ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
         r"""
         labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
         inputs = input_processing(
             func=self.call,
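The rule in the docstring above, choosing the loss from `config.num_labels`, can be sketched in one hypothetical function (the string return values are illustrative, not library identifiers):

```python
# Sketch of the num_labels rule: a single label dimension means regression
# (mean-square loss), more than one means classification (cross-entropy).
def loss_kind(num_labels):
    return "regression (MSE)" if num_labels == 1 else "classification (cross-entropy)"
```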
@@ -1334,12 +1337,12 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
         r"""
         start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
-            sequence are not taken into account for computing the loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
         end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
-            sequence are not taken into account for computing the loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
         """
         inputs = input_processing(
             func=self.call,
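The clamping described for `start_positions`/`end_positions` above amounts to the following pure-Python sketch (hypothetical helper; in the model the clamped positions are then ignored by the loss):

```python
# Sketch of span-position clamping: any position outside [0, sequence_length]
# is clamped to the sequence boundary before the loss is computed.
def clamp_position(position, sequence_length):
    return min(max(position, 0), sequence_length)
```

For example, with `sequence_length = 512`, a labelled start position of 930 is clamped to 512.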
@@ -1460,8 +1463,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
     ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
         r"""
         labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` where `num_choices` is the size of the second dimension of the input tensors. (See
-            `input_ids` above)
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
         """
         inputs = input_processing(
             func=self.call,
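The multiple-choice label convention above can be checked with a small hypothetical helper: with input tensors of shape `(batch_size, num_choices, seq_len)`, each label must index one of the `num_choices` options along the second dimension.

```python
# Sketch of the multiple-choice label check: the label selects one choice out
# of the second dimension of the input tensors.
def label_is_valid(label, input_shape):
    _, num_choices, _ = input_shape  # (batch_size, num_choices, seq_len)
    return 0 <= label < num_choices
```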
@@ -60,8 +60,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
     """
     Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
-    Users should refer to this superclass for more information regarding those methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.

     Args:
         vocab_file (`str`):
@@ -88,8 +88,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
             <Tip>

-            When building a sequence using special tokens, this is not the token that is used for the end of
-            sequence. The token used is the `sep_token`.
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.

             </Tip>
@@ -109,7 +109,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
         sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:

             - `enable_sampling`: Enable subword regularization.
             - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
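A minimal example of such a dict, using only the options named above (the values 64 and the comments are illustrative assumptions; the tokenizer forwards the dict verbatim to `SentencePieceProcessor.__init__()`):

```python
# Example sp_model_kwargs, restricted to the options documented above.
sp_model_kwargs = {
    "enable_sampling": True,  # turn on subword regularization
    "nbest_size": 64,         # unigram sampling parameter (invalid for BPE-Dropout)
}
```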
@@ -324,8 +326,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
                 Optional second list of IDs for sequence pairs.

         Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
-            sequence(s).
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]