Unverified Commit 87e6e4fe authored by Sylvain Gugger, committed by GitHub

Doc styler v2 (#14950)

* New doc styler

* Fix issue with args at the start

* Code sample fixes

* Style code examples in MDX

* Fix more patterns

* Typo

* Typo

* More patterns

* Do without black for now

* Get more info in error

* Docstring style

* Re-enable check

* Quality

* Fix add_end_docstring decorator

* Fix docstring
parent c1138273
@@ -72,9 +72,10 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
@@ -213,8 +214,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -28,8 +28,8 @@ logger = logging.get_logger(__name__)

CLASS_DOCSTRING = """
    This is a generic model class that will be instantiated as one of the model classes of the library when created
    with the [`~BaseAutoModelClass.from_pretrained`] class method or the [`~BaseAutoModelClass.from_config`] class
    method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
"""
@@ -39,8 +39,7 @@ FROM_CONFIG_DOCSTRING = """
    Note:
        Loading a model from its configuration file does **not** load the model weights. It only affects the
        model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model weights.

    Args:
        config ([`PretrainedConfig`]):
@@ -62,8 +61,8 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
    Instantiate one of the model classes of the library from a pretrained model.

    The model class to instantiate is selected based on the `model_type` property of the config object (either
    passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
    falling back to using pattern matching on `pretrained_model_name_or_path`:

    List options
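The pattern-matching fallback can be pictured roughly like this (a hypothetical mapping with only three entries; the real library keeps an ordered mapping over all model types):

```python
# Checking longer patterns first keeps e.g. "roberta" from being matched as "bert".
MODEL_PATTERNS = {"roberta": "RobertaModel", "gpt2": "GPT2Model", "bert": "BertModel"}


def model_class_name(pretrained_model_name_or_path):
    for pattern in sorted(MODEL_PATTERNS, key=len, reverse=True):
        if pattern in pretrained_model_name_or_path:
            return MODEL_PATTERNS[pattern]
    raise ValueError(f"Could not infer the model type from {pretrained_model_name_or_path!r}")


print(model_class_name("dbmdz/bert-base-german-cased"))  # → BertModel
```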
@@ -75,14 +74,14 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
            Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                - A path or url to a *tensorflow index checkpoint file* (e.g., `./tf_model/model.ckpt.index`). In
                  this case, `from_tf` should be set to `True` and a configuration object should be provided as
                  `config` argument. This loading path is slower than converting the TensorFlow checkpoint into a
                  PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
        model_args (additional positional arguments, *optional*):
            Will be passed along to the underlying model `__init__()` method.
        config ([`PretrainedConfig`], *optional*):
@@ -91,16 +90,15 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
                - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                  model).
                - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                  save directory.
                - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                  configuration JSON file named *config.json* is found in the directory.
        state_dict (*Dict[str, torch.Tensor]*, *optional*):
            A state dictionary to use instead of a state dictionary loaded from the saved weights file.

            This option can be used if you want to create a model from a pretrained configuration but load your own
            weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
            [`~PreTrainedModel.from_pretrained`] is not a simpler option.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the
@@ -115,7 +113,8 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        output_loading_info (`bool`, *optional*, defaults to `False`):
            Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
        local_files_only (`bool`, *optional*, defaults to `False`):
@@ -126,8 +125,8 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
            identifier allowed by git.
        trust_remote_code (`bool`, *optional*, defaults to `False`):
            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
            should only be set to `True` for repositories you trust and in which you have read the code, as it will
            execute code present on the Hub on your local machine.
        kwargs (additional keyword arguments, *optional*):
            Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
            `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
@@ -137,10 +136,10 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
                  underlying model's `__init__` method (we assume all relevant updates to the configuration have
                  already been done)
                - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                  initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                  corresponds to a configuration attribute will be used to override said attribute with the
                  supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                  will be passed to the underlying model's `__init__` function.

    Examples:
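The split described above — configuration attributes versus model `__init__` arguments — amounts to something like this sketch (simplified; `config_attrs` stands in for the attribute names of the loaded configuration):

```python
def split_kwargs(config_attrs, **kwargs):
    # Keys matching configuration attributes override the config;
    # everything else is forwarded to the model's __init__.
    config_overrides = {k: v for k, v in kwargs.items() if k in config_attrs}
    model_kwargs = {k: v for k, v in kwargs.items() if k not in config_attrs}
    return config_overrides, model_kwargs


overrides, model_kwargs = split_kwargs(
    {"output_attentions", "hidden_size"}, output_attentions=True, foo=1
)
print(overrides, model_kwargs)  # → {'output_attentions': True} {'foo': 1}
```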
@@ -165,8 +164,8 @@ FROM_PRETRAINED_TF_DOCSTRING = """
    Instantiate one of the model classes of the library from a pretrained model.

    The model class to instantiate is selected based on the `model_type` property of the config object (either
    passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
    falling back to using pattern matching on `pretrained_model_name_or_path`:

    List options
@@ -175,15 +174,14 @@ FROM_PRETRAINED_TF_DOCSTRING = """
            Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In this
                  case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
                  argument. This loading path is slower than converting the PyTorch model into a TensorFlow model
                  using the provided conversion scripts and loading the TensorFlow model afterwards.
        model_args (additional positional arguments, *optional*):
            Will be passed along to the underlying model `__init__()` method.
        config ([`PretrainedConfig`], *optional*):
@@ -192,8 +190,8 @@ FROM_PRETRAINED_TF_DOCSTRING = """
                - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                  model).
                - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                  save directory.
                - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                  configuration JSON file named *config.json* is found in the directory.
        cache_dir (`str` or `os.PathLike`, *optional*):
@@ -209,7 +207,8 @@ FROM_PRETRAINED_TF_DOCSTRING = """
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        output_loading_info (`bool`, *optional*, defaults to `False`):
            Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
        local_files_only (`bool`, *optional*, defaults to `False`):
@@ -220,8 +219,8 @@ FROM_PRETRAINED_TF_DOCSTRING = """
            identifier allowed by git.
        trust_remote_code (`bool`, *optional*, defaults to `False`):
            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
            should only be set to `True` for repositories you trust and in which you have read the code, as it will
            execute code present on the Hub on your local machine.
        kwargs (additional keyword arguments, *optional*):
            Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
            `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
@@ -231,10 +230,10 @@ FROM_PRETRAINED_TF_DOCSTRING = """
                  underlying model's `__init__` method (we assume all relevant updates to the configuration have
                  already been done)
                - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                  initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                  corresponds to a configuration attribute will be used to override said attribute with the
                  supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                  will be passed to the underlying model's `__init__` function.

    Examples:
@@ -259,8 +258,8 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
    Instantiate one of the model classes of the library from a pretrained model.

    The model class to instantiate is selected based on the `model_type` property of the config object (either
    passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
    falling back to using pattern matching on `pretrained_model_name_or_path`:

    List options
@@ -269,15 +268,14 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
            Can be either:

                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                  user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing model weights saved using
                  [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In this
                  case, `from_pt` should be set to `True` and a configuration object should be provided as `config`
                  argument. This loading path is slower than converting the PyTorch model into a TensorFlow model
                  using the provided conversion scripts and loading the TensorFlow model afterwards.
        model_args (additional positional arguments, *optional*):
            Will be passed along to the underlying model `__init__()` method.
        config ([`PretrainedConfig`], *optional*):
@@ -286,8 +284,8 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
                - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                  model).
                - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                  save directory.
                - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                  configuration JSON file named *config.json* is found in the directory.
        cache_dir (`str` or `os.PathLike`, *optional*):
@@ -303,7 +301,8 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        output_loading_info (`bool`, *optional*, defaults to `False`):
            Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
        local_files_only (`bool`, *optional*, defaults to `False`):
@@ -314,8 +313,8 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
            identifier allowed by git.
        trust_remote_code (`bool`, *optional*, defaults to `False`):
            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
            should only be set to `True` for repositories you trust and in which you have read the code, as it will
            execute code present on the Hub on your local machine.
        kwargs (additional keyword arguments, *optional*):
            Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
            `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
@@ -325,10 +324,10 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
                  underlying model's `__init__` method (we assume all relevant updates to the configuration have
                  already been done)
                - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                  initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                  corresponds to a configuration attribute will be used to override said attribute with the
                  supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                  will be passed to the underlying model's `__init__` function.

    Examples:
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Config class."""
import importlib
import re
import warnings
@@ -501,9 +501,8 @@ class AutoConfig:
        r"""
        Instantiate one of the configuration classes of the library from a pretrained model configuration.

        The configuration class to instantiate is selected based on the `model_type` property of the config object that
        is loaded, or when it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options
@@ -515,8 +514,8 @@ class AutoConfig:
                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
                - A path to a *directory* containing a configuration file saved using the
                  [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method,
                  e.g., `./my_model_directory/`.
                - A path or url to a saved configuration JSON *file*, e.g.,
                  `./my_model_directory/configuration.json`.
        cache_dir (`str` or `os.PathLike`, *optional*):
@@ -529,7 +528,8 @@ class AutoConfig:
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
@@ -537,13 +537,13 @@ class AutoConfig:
        return_unused_kwargs (`bool`, *optional*, defaults to `False`):
            If `False`, then this function returns just the final configuration object.

            If `True`, then this function returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a
            dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the
            part of `kwargs` which has not been used to update `config` and is otherwise ignored.
        trust_remote_code (`bool`, *optional*, defaults to `False`):
            Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
            should only be set to `True` for repositories you trust and in which you have read the code, as it will
            execute code present on the Hub on your local machine.
        kwargs (additional keyword arguments, *optional*):
            The values in kwargs of any keys which are configuration attributes will be used to override the loaded
            values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
......
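The `return_unused_kwargs` split described above can be sketched in plain Python. This is a toy illustration, not the transformers implementation; the config class and attribute names are made up:

```python
class ToyConfig:
    """Minimal stand-in for a configuration object (hypothetical attributes)."""

    def __init__(self):
        self.vocab_size = 50265
        self.hidden_size = 768


def apply_kwargs(config, return_unused_kwargs=False, **kwargs):
    """Apply kwargs that match config attributes; collect the rest as unused."""
    unused = {}
    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)
        else:
            unused[key] = value
    if return_unused_kwargs:
        return config, unused
    return config


# `hidden_size` updates the config; `foo` is not an attribute, so it is returned.
config, unused = apply_kwargs(ToyConfig(), return_unused_kwargs=True, hidden_size=1024, foo=1)
```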
...@@ -122,8 +122,8 @@ def get_class_from_dynamic_module(
<Tip warning={true}>
Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should
therefore only be called on trusted repos.
</Tip>
...@@ -132,8 +132,8 @@ def get_class_from_dynamic_module(
This can be either:
- a string, the *model id* of a pretrained model configuration hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a configuration file saved using the
[`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
...@@ -150,10 +150,11 @@ def get_class_from_dynamic_module(
resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Attempts to resume the download if such a file exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
...
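The `proxies` dictionary above mixes bare-scheme keys (`'http'`) with scheme-plus-host keys (`'http://hostname'`), the latter taking precedence for matching hosts. A toy resolver sketches that precedence; this mirrors requests-style proxy mappings and is not the library's actual resolution code:

```python
from urllib.parse import urlparse


def pick_proxy(url, proxies):
    """Resolve a proxy for `url` from a requests-style proxy mapping.

    A scheme+host key such as 'http://hostname' takes precedence over a
    bare scheme key such as 'http'. Toy sketch, not the real resolver.
    """
    parsed = urlparse(url)
    specific = f"{parsed.scheme}://{parsed.hostname}"
    if specific in proxies:
        return proxies[specific]
    return proxies.get(parsed.scheme)


proxies = {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}
```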
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" AutoFeatureExtractor class."""
import importlib
import os
from collections import OrderedDict
...@@ -82,9 +82,9 @@ class AutoFeatureExtractor:
r"""
Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary.
The feature extractor class to instantiate is selected based on the `model_type` property of the config object
(either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
...@@ -110,19 +110,20 @@ class AutoFeatureExtractor:
Whether or not to delete incompletely received files. Attempts to resume the download if such a file
exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
If `False`, then this function returns just the final feature extractor object. If `True`, then this
function returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary
consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of
`kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are feature extractor attributes will be used to override the
loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
...
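The two-step selection described above (first by the config's `model_type`, then by pattern matching on `pretrained_model_name_or_path`) can be sketched with a toy registry. The registry contents and function name are made up; the real mapping lives inside the Auto classes:

```python
# Hypothetical model_type -> class-name registry, for illustration only.
REGISTRY = {
    "bart": "BartFeatureExtractorStandIn",
    "bert": "BertFeatureExtractorStandIn",
}


def resolve_class(name_or_path, model_type=None):
    """Pick a class by `model_type` first, falling back to pattern matching
    on the model id / path. Toy sketch of the Auto-class dispatch."""
    if model_type is not None and model_type in REGISTRY:
        return REGISTRY[model_type]
    # Fallback: pattern-match the pretrained model name or path itself.
    for key, cls in REGISTRY.items():
        if key in name_or_path.lower():
            return cls
    raise ValueError(f"Could not infer a class for {name_or_path!r}")
```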
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class."""
import warnings
from collections import OrderedDict
...
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class."""
from collections import OrderedDict
...
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class."""
import warnings
...
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" AutoProcessor class."""
import importlib
from collections import OrderedDict
...@@ -79,8 +79,8 @@ class AutoProcessor:
r"""
Instantiate one of the processor classes of the library from a pretrained model vocabulary.
The processor class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible):
List options
...@@ -103,19 +103,20 @@ class AutoProcessor:
Whether or not to delete incompletely received files. Attempts to resume the download if such a file
exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
identifier allowed by git.
return_unused_kwargs (`bool`, *optional*, defaults to `False`):
If `False`, then this function returns just the final processor object. If `True`, then this function
returns a `Tuple(processor, unused_kwargs)` where *unused_kwargs* is a dictionary consisting of the
key/value pairs whose keys are not processor attributes: i.e., the part of `kwargs` which has not been
used to update `processor` and is otherwise ignored.
kwargs (`Dict[str, Any]`, *optional*):
The values in kwargs of any keys which are processor attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* processor attributes is
...
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Tokenizer class."""
import importlib
import json
...@@ -279,8 +279,8 @@ def get_tokenizer_config(
This can be either:
- a string, the *model id* of a pretrained model configuration hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced
under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a configuration file saved using the
[`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
...@@ -293,10 +293,11 @@ def get_tokenizer_config(
resume_download (`bool`, *optional*, defaults to `False`):
Whether or not to delete incompletely received files. Attempts to resume the download if such a file exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
use_auth_token (`str` or `bool`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `transformers-cli login` (stored in `~/.huggingface`).
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
...@@ -390,9 +391,9 @@ class AutoTokenizer:
r"""
Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
falling back to using pattern matching on `pretrained_model_name_or_path`:
List options
...@@ -401,11 +402,10 @@ class AutoTokenizer:
Can be either:
- A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
user or organization name, like `dbmdz/bert-base-german-cased`.
- A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
applicable to all derived classes)
...@@ -423,7 +423,8 @@ class AutoTokenizer:
Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists.
proxies (`Dict[str, str]`, *optional*):
A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
revision (`str`, *optional*, defaults to `"main"`):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
...@@ -437,12 +438,12 @@ class AutoTokenizer:
Tokenizer type to be loaded.
trust_remote_code (`bool`, *optional*, defaults to `False`):
Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
should only be set to `True` for repositories you trust and in which you have read the code, as it will
execute code present on the Hub on your local machine.
kwargs (additional keyword arguments, *optional*):
Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
`bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
`additional_special_tokens`. See parameters in the `__init__()` for more details.
Examples:
...
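The `trust_remote_code` guard described in the docstrings above can be sketched as a simple gate: loading code from the Hub executes it locally, so it must be opted into explicitly. Function and return names here are illustrative, not the real API:

```python
def load_tokenizer_class(has_remote_code, trust_remote_code=False):
    """Toy gate for the `trust_remote_code` flag.

    Refuses to load Hub-hosted code unless the caller explicitly opts in.
    """
    if has_remote_code and not trust_remote_code:
        raise ValueError(
            "Loading this tokenizer requires executing code from the Hub; "
            "pass `trust_remote_code=True` only for repos you have read and trust."
        )
    return "remote-tokenizer-class" if has_remote_code else "builtin-tokenizer-class"
```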
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BART model configuration"""
import warnings
from collections import OrderedDict
from typing import Any, Mapping, Optional
...@@ -35,19 +35,19 @@ BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BartConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BartModel`]. It is used to instantiate a BART
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the BART
[facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
encoder_layers (`int`, *optional*, defaults to 12):
...@@ -63,8 +63,8 @@ class BartConfig(PretrainedConfig):
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
...@@ -79,11 +79,11 @@ class BartConfig(PretrainedConfig):
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by dividing by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
...
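The `scale_embedding` option above multiplies token embeddings by sqrt(d_model) when enabled; with the default `d_model=1024` that factor is 32. A minimal sketch (the helper name is hypothetical, not a transformers function):

```python
import math


def embed_scale(d_model, scale_embedding):
    """Embedding multiplier implied by `scale_embedding`: sqrt(d_model)
    when enabled, otherwise 1.0."""
    return math.sqrt(d_model) if scale_embedding else 1.0
```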
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BART model."""
import copy
import math
import random
...@@ -380,7 +380,8 @@ class BartDecoderLayer(nn.Module):
hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
attention_mask (`torch.FloatTensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
encoder_hidden_states (`torch.FloatTensor`):
cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
...@@ -517,13 +518,13 @@ class PretrainedBartModel(BartPretrainedModel):
BART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
and behavior.
Parameters:
config ([`BartConfig`]):
...@@ -537,29 +538,27 @@ BART_GENERATION_EXAMPLE = r""" ...@@ -537,29 +538,27 @@ BART_GENERATION_EXAMPLE = r"""
>>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
>>> # Generate Summary
>>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
>>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
Mask filling example::
>>> from transformers import BartTokenizer, BartForConditionalGeneration
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
>>> TXT = "My friends are <mask> but they eat too many carbs."
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
"""
@@ -570,9 +569,8 @@ BART_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -585,26 +583,24 @@ BART_INPUTS_DOCSTRING = r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
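The right-shift described above can be sketched in pure Python (the function name and list-based inputs are illustrative assumptions; the library version operates on tensors):

```python
# Illustrative sketch: build `decoder_input_ids` by moving every token one
# position to the right and placing the decoder start token first (for Bart,
# the `eos_token_id`). List-based types are an assumption for readability.
def shift_tokens_right(input_ids, decoder_start_token_id):
    return [[decoder_start_token_id] + row[:-1] for row in input_ids]

# One batch element ending in eos (id 2):
print(shift_tokens_right([[5, 6, 7, 2]], decoder_start_token_id=2))
# → [[2, 5, 6, 7]]
```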
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_inputs`] and
modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information
on the default strategy.
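A minimal sketch of that default behavior, combining the causal constraint with padding removal (plain Python lists and a hypothetical helper name, not the library's implementation):

```python
# Hedged sketch of the default decoder mask: position i may attend to
# positions j <= i (causal), and padding positions are masked out everywhere.
def default_decoder_mask(decoder_input_ids, pad_token_id):
    n = len(decoder_input_ids)
    return [
        [1 if j <= i and decoder_input_ids[j] != pad_token_id else 0 for j in range(n)]
        for i in range(n)
    ]

print(default_decoder_mask([2, 5, 6, 1], pad_token_id=1))
# → [[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], [1, 1, 1, 0]]
```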
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
@@ -618,39 +614,42 @@ BART_INPUTS_DOCSTRING = r"""
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
`[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
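As a hedged illustration of the 1/0 convention, a head mask simply zeroes the attention weights of masked heads (nested lists stand in for tensors; the helper is hypothetical):

```python
# Sketch: heads with mask value 0 have their attention weights nullified,
# heads with mask value 1 are left untouched.
def apply_head_mask(weights_per_head, head_mask):
    return [[w * m for w in head] for head, m in zip(weights_per_head, head_mask)]

print(apply_head_mask([[0.5, 0.5], [0.2, 0.8]], head_mask=[1, 0]))
# → [[0.5, 0.5], [0.0, 0.0]]
```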
encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
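The cache layout documented for `past_key_values` can be sketched as plain shape bookkeeping (the helper name and keyword names are hypothetical):

```python
# Per decoder layer: two self-attention tensors of shape
# (batch, heads, seq_len, head_dim) and two cross-attention tensors of shape
# (batch, heads, enc_seq_len, head_dim), as described above.
def past_key_values_shapes(n_layers, batch, heads, seq_len, enc_seq_len, head_dim):
    self_attn = (batch, heads, seq_len, head_dim)
    cross_attn = (batch, heads, enc_seq_len, head_dim)
    return tuple((self_attn, self_attn, cross_attn, cross_attn) for _ in range(n_layers))

shapes = past_key_values_shapes(n_layers=2, batch=1, heads=4, seq_len=5, enc_seq_len=7, head_dim=8)
print(len(shapes), shapes[0][0], shapes[0][2])
# → 2 (1, 4, 5, 8) (1, 4, 7, 8)
```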
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
input (see `past_key_values`). This is useful if you want more control over how to convert
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
of `inputs_embeds`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -721,9 +720,8 @@ class BartEncoder(BartPretrainedModel):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -740,9 +738,9 @@ class BartEncoder(BartPretrainedModel):
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -916,9 +914,8 @@ class BartDecoder(BartPretrainedModel):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -953,19 +950,20 @@ class BartDecoder(BartPretrainedModel):
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -1304,7 +1302,8 @@ class BartForConditionalGeneration(BartPretrainedModel):
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in
`[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
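A pure-Python sketch of the `-100` convention (illustrative only; in practice this is handled by PyTorch's cross-entropy `ignore_index`):

```python
# Labels equal to -100 are ignored: the loss is averaged only over the
# remaining positions.
def masked_loss(per_token_losses, labels, ignore_index=-100):
    kept = [l for l, y in zip(per_token_losses, labels) if y != ignore_index]
    return sum(kept) / len(kept)

print(masked_loss([2.0, 9.0, 4.0], labels=[12, -100, 7]))
# → 3.0
```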
Returns:
@@ -1446,7 +1445,8 @@ class BartForSequenceClassification(BartPretrainedModel):
):
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in
`[0, ..., config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
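A hedged sketch of the rule above (the helper is hypothetical and only mirrors the documented behavior; treating a single label as regression is the common convention, not stated verbatim here):

```python
# One label → regression target; more than one → class indices.
def loss_kind(num_labels):
    return "regression (MSE)" if num_labels == 1 else "classification (Cross-Entropy)"

print(loss_kind(1), "|", loss_kind(3))
# → regression (MSE) | classification (Cross-Entropy)
```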
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None: if labels is not None:
@@ -1712,9 +1712,8 @@ class BartForCausalLM(BartPretrainedModel):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1743,25 +1742,24 @@ class BartForCausalLM(BartPretrainedModel):
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in
`[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
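The 1/0 convention above can be sketched as deriving an attention mask from padded ids (hypothetical helper, plain lists):

```python
# Mark real tokens with 1 and padding with 0 so attention ignores pads.
def attention_mask_from_ids(input_ids, pad_token_id):
    return [0 if token == pad_token_id else 1 for token in input_ids]

print(attention_mask_from_ids([0, 31414, 232, 2, 1, 1], pad_token_id=1))
# → [1, 1, 1, 1, 0, 0]
```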
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Flax Bart model. """ """ Flax Bart model."""
import math
import random
@@ -59,12 +59,13 @@ _TOKENIZER_FOR_DOC = "BartTokenizer"
BART_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
@@ -76,11 +77,10 @@ BART_START_DOCSTRING = r"""
Parameters:
config ([`BartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
`jax.numpy.bfloat16` (on TPUs).
This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
specified, all the computation will be performed with the given `dtype`.
@@ -88,8 +88,8 @@ BART_START_DOCSTRING = r"""
**Note that this only specifies the dtype of the computation and does not influence the dtype of model
parameters.**
If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
[`~FlaxPreTrainedModel.to_bf16`].
"""
BART_INPUTS_DOCSTRING = r"""
@@ -98,9 +98,8 @@ BART_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
@@ -113,22 +112,23 @@ BART_INPUTS_DOCSTRING = r"""
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary. Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`BartTokenizer`]. See Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for [`PreTrainedTokenizer.__call__`] for details.
details.
[What are decoder input IDs?](../glossary#decoder-input-ids)

For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.

If you want to change padding behavior, you should modify it to your needs. See diagram 1 in [the
paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
range `[0, config.max_position_embeddings - 1]`.
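The right-shifting behavior described for `decoder_input_ids` can be sketched in a few lines. This is a simplified NumPy illustration of the idea, not the library's exact implementation; the function name and the `-100` label-padding convention are assumptions borrowed from common seq2seq practice:

```python
import numpy as np


def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    """Shift token ids one position to the right, prepending the decoder start token."""
    shifted = np.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1]       # every token moves one slot to the right
    shifted[:, 0] = decoder_start_token_id   # first decoder input is the start token
    # labels sometimes use -100 for ignored positions; map those back to pad tokens
    shifted = np.where(shifted == -100, pad_token_id, shifted)
    return shifted


batch = np.array([[5, 6, 7, 8]])
print(shift_tokens_right(batch, pad_token_id=1, decoder_start_token_id=2))
# [[2 5 6 7]]
```

The last target token is dropped because the decoder never needs to predict anything after it.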
@@ -149,9 +149,8 @@ BART_ENCODE_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
@@ -162,7 +161,8 @@ BART_ENCODE_INPUTS_DOCSTRING = r"""
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
@@ -178,20 +178,18 @@ BART_DECODE_INPUTS_DOCSTRING = r"""
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
Indices of decoder input sequence tokens in the vocabulary.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are decoder input IDs?](../glossary#decoder-input-ids)

For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
encoder_outputs (`tuple(tuple(jnp.ndarray))`):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states at
the output of the last layer of the encoder. Used in the cross-attention of the decoder.
encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
@@ -200,10 +198,11 @@ BART_DECODE_INPUTS_DOCSTRING = r"""
[What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.

If you want to change padding behavior, you should modify it to your needs. See diagram 1 in [the
paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence token in the position embeddings. Selected in the
range `[0, config.max_position_embeddings - 1]`.
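The default masking strategy described above (ignore pad tokens, apply a causal mask) can be sketched as the product of a padding mask and a lower-triangular matrix. The helper below is illustrative only, with shapes and names of my choosing, not the library's internal code:

```python
import numpy as np


def default_decoder_mask(decoder_input_ids, pad_token_id):
    """Combine a padding mask with a causal mask: 1 = attend, 0 = masked."""
    batch, tgt_len = decoder_input_ids.shape
    # 1 for real tokens, 0 for padding
    pad_mask = (decoder_input_ids != pad_token_id).astype(np.int32)   # (batch, tgt_len)
    # lower-triangular causal mask: position i may attend to positions <= i
    causal = np.tril(np.ones((tgt_len, tgt_len), dtype=np.int32))     # (tgt_len, tgt_len)
    # broadcast to (batch, tgt_len, tgt_len)
    return causal[None, :, :] * pad_mask[:, None, :]


ids = np.array([[2, 5, 6, 1]])  # pretend 1 is the pad token
print(default_decoder_mask(ids, pad_token_id=1)[0])
```

Each query row can only see earlier (non-pad) positions, which is exactly what "causal mask plus pad mask" means.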
@@ -968,9 +967,10 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
cache.
encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
`encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
`attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
"""
# init input variables to retrieve cache
decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
@@ -1510,29 +1510,26 @@ FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING = """
>>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration

>>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='jax')

>>> # Generate Summary
>>> summary_ids = model.generate(inputs['input_ids']).sequences
>>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))

Mask filling example::

>>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration

>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
>>> TXT = "My friends are <mask> but they eat too many carbs."

>>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> input_ids = tokenizer([TXT], return_tensors='jax')['input_ids']
>>> logits = model(input_ids).logits

>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()
>>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
>>> values, predictions = jax.lax.top_k(probs)

>>> tokenizer.decode(predictions).split()
"""
...
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 Bart model."""

import random
@@ -375,7 +375,8 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
attention_mask (`tf.Tensor`): attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
encoder_hidden_states (`tf.Tensor`):
cross attention input to the layer of shape *(seq_len, batch, embed_dim)*
encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
*(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
@@ -472,13 +473,13 @@ class TFBartPretrainedModel(TFPreTrainedModel):

BART_START_DOCSTRING = r"""
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)

This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
behavior.

<Tip>

@@ -487,11 +488,11 @@ BART_START_DOCSTRING = r"""
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.

This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all the
tensors in the first argument of the model call function: `model(inputs)`.

If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the
first positional argument:

- a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
@@ -504,8 +505,7 @@ BART_START_DOCSTRING = r"""

Args:
config ([`BartConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""
@@ -514,27 +514,25 @@ BART_GENERATION_EXAMPLE = r"""
>>> from transformers import BartTokenizer, TFBartForConditionalGeneration, BartConfig

>>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='tf')

>>> # Generate Summary
>>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
>>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

Mask filling example::

>>> from transformers import BartTokenizer, TFBartForConditionalGeneration

>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
>>> TXT = "My friends are <mask> but they eat too many carbs."

>>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> input_ids = tokenizer([TXT], return_tensors='tf')['input_ids']
>>> logits = model(input_ids).logits
>>> probs = tf.nn.softmax(logits[0])
>>> # probs[5] is associated with the mask token
"""
@@ -543,9 +541,8 @@ BART_INPUTS_DOCSTRING = r"""
input_ids (`tf.Tensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
@@ -558,19 +555,17 @@ BART_INPUTS_DOCSTRING = r"""
decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are decoder input IDs?](../glossary#decoder-input-ids)

Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
@@ -596,12 +591,12 @@ BART_INPUTS_DOCSTRING = r"""
of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`):
contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*, defaults to `True`):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`). Set to `False` during training, `True` during generation.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
@@ -611,8 +606,8 @@ BART_INPUTS_DOCSTRING = r"""
more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
used instead.
return_dict (`bool`, *optional*):
Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
in eager mode, in graph mode the value will always be set to True.
training (`bool`, *optional*, defaults to `False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
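The `past_key_values` mechanics described above amount to: on the first call pass the full sequence, then feed only the newest token together with the returned cache. A toy pure-Python sketch of the control flow (the cache here just accumulates seen ids, whereas real caches store key/value tensors per layer):

```python
def decode_step(new_ids, past=None):
    """Toy stand-in for one decoder call: returns (current_length, updated_cache)."""
    past = [] if past is None else past
    cache = past + list(new_ids)  # real models concatenate cached key/value states
    return len(cache), cache


# first call: full decoder_input_ids of shape (batch, seq_len)
length, cache = decode_step([2, 5, 6])
# subsequent calls: only the last decoder_input_ids, shape (batch, 1)
length, cache = decode_step([7], past=cache)
print(length, cache)  # 4 [2, 5, 6, 7]
```

This is why, with `use_cache=True`, generation only ever feeds a single new token per step after the first.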
@@ -672,9 +667,8 @@ class TFBartEncoder(tf.keras.layers.Layer):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -684,15 +678,16 @@ class TFBartEncoder(tf.keras.layers.Layer):
- 0 for tokens that are **masked**.

[What are attention masks?](../glossary#attention-mask)
head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -839,9 +834,8 @@ class TFBartDecoder(tf.keras.layers.Layer):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.

Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.

[What are input IDs?](../glossary#input-ids)
attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -878,11 +872,13 @@ class TFBartDecoder(tf.keras.layers.Layer):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
decoding.

If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -1347,7 +1343,8 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
):
r"""
labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or `-100` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Returns:
...
@@ -56,9 +56,8 @@ class BartTokenizer(RobertaTokenizer):
r"""
Construct a BART tokenizer.

[`BartTokenizer`] is identical to [`RobertaTokenizer`]. Refer to superclass [`RobertaTokenizer`] for usage examples
and documentation concerning the initialization parameters and other methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
...
@@ -65,9 +65,8 @@ class BartTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library).

[`BartTokenizerFast`] is identical to [`RobertaTokenizerFast`]. Refer to superclass [`RobertaTokenizerFast`] for
usage examples and documentation concerning the initialization parameters and other methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
......
...@@ -48,11 +48,11 @@ SPIECE_UNDERLINE = "▁"
class BarthezTokenizer(PreTrainedTokenizer):
"""
Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a BARThez tokenizer. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
...@@ -73,8 +73,8 @@ class BarthezTokenizer(PreTrainedTokenizer):
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
...@@ -96,7 +96,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
...
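The subword-regularization options above can be sketched as a concrete `sp_model_kwargs` dict. Values are illustrative only, and the `alpha` key (the sampling smoothing parameter) is an assumption not listed in the truncated docstring above:

```python
# Illustrative values only: a typical `sp_model_kwargs` dict enabling subword
# regularization with a Unigram model. The exact values are use-case dependent.
sp_model_kwargs = {
    "enable_sampling": True,  # sample segmentations instead of the 1-best one
    "nbest_size": -1,         # -1: sample from all hypotheses (Unigram only)
    "alpha": 0.1,             # smoothing parameter for the sampling distribution
}
# The tokenizer forwards these to SentencePieceProcessor.__init__(), e.g.:
# tokenizer = BarthezTokenizer("sentencepiece.bpe.model", sp_model_kwargs=sp_model_kwargs)
```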
...@@ -58,11 +58,11 @@ SPIECE_UNDERLINE = "▁"
class BarthezTokenizerFast(PreTrainedTokenizerFast):
"""
Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast" BARThez tokenizer. Based on
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
...@@ -83,8 +83,8 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
...
...@@ -47,8 +47,8 @@ class BartphoTokenizer(PreTrainedTokenizer):
"""
Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
...@@ -72,8 +72,8 @@ class BartphoTokenizer(PreTrainedTokenizer):
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
...@@ -95,7 +95,9 @@ class BartphoTokenizer(PreTrainedTokenizer):
additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
to set:
- `enable_sampling`: Enable subword regularization.
- `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
...
...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BEiT model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
...@@ -28,11 +28,10 @@ BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
class BeitConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BeitModel`]. It is used to instantiate a BEiT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the BEiT
[microsoft/beit-base-patch16-224-in22k](https://huggingface.co/microsoft/beit-base-patch16-224-in22k) architecture.
Args:
vocab_size (`int`, *optional*, defaults to 8092):
...@@ -47,8 +46,8 @@ class BeitConfig(PretrainedConfig):
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
...
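The configuration pattern described above (defaults reproduce the base architecture; any keyword overrides it) can be sketched in a minimal stand-in class. This is not the real `BeitConfig`; `hidden_size=768` is an assumed default not shown in the docstring excerpt above, and the other defaults are taken from it.

```python
# Minimal sketch (not the actual BeitConfig) of the config-defaults pattern:
# instantiating with no arguments yields the base architecture, and any
# keyword argument overrides a single field.
class TinyBeitConfig:
    def __init__(
        self,
        vocab_size=8092,
        hidden_size=768,  # assumed default, not stated in the excerpt
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

default = TinyBeitConfig()                      # base architecture
wide = TinyBeitConfig(intermediate_size=4096)   # one field overridden
```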