Unverified Commit 969859d5 authored by Santiago Castro, committed by GitHub

Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
parent 4731a00c
@@ -25,14 +25,14 @@ logger = logging.get_logger(__name__)
class TFGenerationMixin:
"""
- A class contraining all of the functions supporting generation, to be used as a mixin in
- :class:`~transfomers.TFPreTrainedModel`.
+ A class containing all of the functions supporting generation, to be used as a mixin in
+ :class:`~transformers.TFPreTrainedModel`.
"""
def prepare_inputs_for_generation(self, inputs, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.TFPreTrainedModel` for custom behavior to prepare inputs in the
- generate method.
+ Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
+ the generate method.
"""
return {"inputs": inputs}
@@ -216,17 +216,17 @@ class TFGenerationMixin:
)
if input_ids is not None:
- batch_size = shape_list(input_ids)[0] # overriden by the input batch_size
+ batch_size = shape_list(input_ids)[0] # overridden by the input batch_size
else:
batch_size = 1
- assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+ assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
- assert temperature > 0, "`temperature` should be strictely positive."
+ assert temperature > 0, "`temperature` should be strictly positive."
assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
@@ -239,10 +239,10 @@ class TFGenerationMixin:
assert (eos_token_id is None) or (
isinstance(eos_token_id, int) and (eos_token_id >= 0)
), "`eos_token_id` should be a positive integer."
- assert length_penalty > 0, "`length_penalty` should be strictely positive."
+ assert length_penalty > 0, "`length_penalty` should be strictly positive."
assert (
isinstance(num_return_sequences, int) and num_return_sequences > 0
- ), "`num_return_sequences` should be a strictely positive integer."
+ ), "`num_return_sequences` should be a strictly positive integer."
assert (
bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
@@ -722,7 +722,7 @@ class TFGenerationMixin:
beam_scores[:, None], (batch_size * num_beams, vocab_size)
) # (batch_size * num_beams, vocab_size)
- # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+ # re-organize to group the beam together (we are keeping top hypothesis across beams)
next_scores = tf.reshape(
next_scores, (batch_size, num_beams * vocab_size)
) # (batch_size, num_beams * vocab_size)
@@ -897,7 +897,7 @@ class TFGenerationMixin:
def adjust_logits_during_generation(self, logits, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
the generate method.
"""
return logits
@@ -978,7 +978,7 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
"""
- Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
@@ -1047,7 +1047,7 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
def sample_without_replacement(logits, num_samples):
"""
- categorical sampling witouth replacement is currently not implemented the gumbel-max trick will do for now see
+ categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
...
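For context on the docstring fixed just above: the Gumbel-max (or Gumbel-top-k) trick it mentions draws several distinct categories by perturbing the logits with Gumbel noise and taking the top-k indices. A minimal, illustrative TensorFlow sketch of the standard form of the trick (not the library's exact helper)::

    import tensorflow as tf

    logits = tf.constant([[2.0, 1.0, 0.5, -1.0]])           # (batch, vocab_size)
    uniform = tf.random.uniform(tf.shape(logits), 0, 1)
    gumbel_noise = -tf.math.log(-tf.math.log(uniform))      # Gumbel(0, 1) samples
    # The top-k indices of the perturbed logits are k samples without replacement
    # from the categorical distribution defined by the logits.
    _, sampled_indices = tf.math.top_k(logits + gumbel_noise, k=2)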
@@ -29,20 +29,20 @@ logger = logging.get_logger(__name__)
class GenerationMixin:
"""
- A class contraining all of the functions supporting generation, to be used as a mixin in
- :class:`~transfomers.PreTrainedModel`.
+ A class containing all of the functions supporting generation, to be used as a mixin in
+ :class:`~transformers.PreTrainedModel`.
"""
def prepare_inputs_for_generation(self, input_ids, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to prepare inputs in the
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
generate method.
"""
return {"input_ids": input_ids}
def adjust_logits_during_generation(self, logits, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
the generate method.
"""
return logits
@@ -285,7 +285,7 @@ class GenerationMixin:
)
if input_ids is not None:
- batch_size = input_ids.shape[0] # overriden by the input batch_size
+ batch_size = input_ids.shape[0] # overridden by the input batch_size
else:
batch_size = 1
@@ -533,7 +533,7 @@ class GenerationMixin:
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
- independantly.
+ independently.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)
@@ -600,7 +600,7 @@ class GenerationMixin:
# unfinished_sents is set to zero if eos in sentence
unfinished_sents.mul_((~eos_in_sents).long())
- # stop when there is a </s> in each sentence, or if we exceed the maximul length
+ # stop when there is a </s> in each sentence, or if we exceed the maximum length
if unfinished_sents.max() == 0:
break
@@ -724,7 +724,7 @@ class GenerationMixin:
else:
next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
- # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+ # re-organize to group the beam together (we are keeping top hypothesis across beams)
next_scores = next_scores.view(
batch_size, num_beams * vocab_size
) # (batch_size, num_beams * vocab_size)
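A brief note on the reshape in this hunk: laying the scores out as (batch_size, num_beams * vocab_size) lets a single top-k call compare candidate continuations across all beams of one example; the originating beam and the token are then recovered by integer division and modulo. A toy sketch of that bookkeeping (illustrative values only, not the library's full beam-search loop)::

    import torch

    batch_size, num_beams, vocab_size = 2, 3, 5
    next_scores = torch.randn(batch_size * num_beams, vocab_size)
    flat_scores = next_scores.view(batch_size, num_beams * vocab_size)
    top_scores, top_ids = flat_scores.topk(2 * num_beams, dim=1)
    beam_ids = top_ids // vocab_size   # which beam each candidate continues
    token_ids = top_ids % vocab_size   # which token that candidate appends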
@@ -969,7 +969,7 @@ def top_k_top_p_filtering(
min_tokens_to_keep: int = 1,
) -> Tensor:
"""
- Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
...
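As a usage note for the function whose docstring is fixed above: the filtered logits are typically renormalized with a softmax and then sampled from, so that only the top-k / nucleus tokens can be drawn. A hedged sketch, assuming ``top_k_top_p_filtering`` is importable from the top-level package (otherwise import it from the module where it is defined)::

    import torch
    import torch.nn.functional as F
    from transformers import top_k_top_p_filtering

    next_token_logits = torch.randn(1, 50257)                        # (batch, vocab_size)
    filtered = top_k_top_p_filtering(next_token_logits.clone(), top_k=50, top_p=0.9)
    probs = F.softmax(filtered, dim=-1)                              # filtered-out tokens get probability 0
    next_token = torch.multinomial(probs, num_samples=1)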
@@ -49,7 +49,7 @@ class ModelCard:
"""
def __init__(self, **kwargs):
- # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+ # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
self.model_details = kwargs.pop("model_details", {})
self.intended_use = kwargs.pop("intended_use", {})
self.factors = kwargs.pop("factors", {})
...
@@ -488,7 +488,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
model_args (additional positional arguments, `optional`):
Will be passed along to the underlying model ``__init__()`` method.
config (:class:`~transformers.PretrainedConfig`, `optional`):
- Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the `shortcut name` string of a
@@ -522,7 +522,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to only look at local files (e.g., not try doanloading the model).
+ Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
@@ -1424,7 +1424,7 @@ class AutoModelForTokenClassification:
class AutoModelForMultipleChoice:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
- multiple choice classifcation head---when created with the when created with the
+ multiple choice classification head---when created with the when created with the
:meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` class method or the
:meth:`~transformers.AutoModelForMultipleChoice.from_config` class method.
...
@@ -906,7 +906,7 @@ class BartModel(PretrainedBartModel):
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
- # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
...
@@ -69,8 +69,8 @@ class XSoftmax(torch.autograd.Function):
Args:
input (:obj:`torch.tensor`): The input tensor that will apply softmax.
- mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
- dim (int): The dimenssion that will apply softmax
+ mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ dim (int): The dimension that will apply softmax
Example::
import torch
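For readers of the docstring fixed above: a masked softmax simply pushes the positions where the mask is 0 to minus infinity before normalizing, so they end up with zero probability. A minimal sketch of the idea in plain PyTorch (not the DeBERTa ``XSoftmax`` autograd function itself)::

    import torch

    scores = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
    mask = torch.tensor([[1, 1, 0, 1]])                      # 0 = ignore this element
    masked_scores = scores.masked_fill(mask == 0, float("-inf"))
    probs = torch.softmax(masked_scores, dim=-1)             # masked element gets probability 0.0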
@@ -540,16 +540,16 @@ class DisentangledSelfAttention(torch.nn.Module):
Args:
hidden_states (:obj:`torch.FloatTensor`):
- Input states to the module usally the output from previous layer, it will be the Q,K and V in
+ Input states to the module usually the output from previous layer, it will be the Q,K and V in
`Attention(Q,K,V)`
attention_mask (:obj:`torch.ByteTensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
+ An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
th token.
return_att (:obj:`bool`, optional):
- Whether return the attention maxitrix.
+ Whether return the attention matrix.
query_states (:obj:`torch.FloatTensor`, optional):
The `Q` state in `Attention(Q,K,V)`.
@@ -627,7 +627,7 @@ class DisentangledSelfAttention(torch.nn.Module):
relative_pos = relative_pos.unsqueeze(1)
# bxhxqxk
elif relative_pos.dim() != 4:
- raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
relative_pos = relative_pos.long().to(query_layer.device)
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
<https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
- improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
+ improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
...
@@ -290,7 +290,7 @@ class Transformer(nn.Module):
attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
Returns:
- hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
+ hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
Tuple of length n_layers with the hidden states from each layer.
Optional: only if output_hidden_states=True
...
@@ -418,7 +418,7 @@ DPR_READER_INPUTS_DOCSTRING = r"""
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to rturn the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
...
@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "EncoderDecoderConfig"
ENCODER_DECODER_START_DOCSTRING = r"""
- This class can be used to inialize a sequence-to-sequnece model with any pretrained autoencoding model as the
+ This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
:meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
...
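The docstring fixed above describes pairing any pretrained autoencoding encoder with any pretrained autoregressive decoder. A hedged usage sketch (the checkpoint names are only illustrative; the added cross-attention weights are newly initialized, so the resulting model still needs fine-tuning)::

    from transformers import EncoderDecoderModel

    # Build a seq2seq model from two pretrained BERT checkpoints; cross-attention
    # layers are added between encoder and decoder.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased"
    )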
@@ -99,7 +99,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`_
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
- also use :obj:`attention_mask` for the same result (see above), kept here for compatbility. Indices
+ also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
selected in ``[0, ..., input_ids.size(-1)]``:
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
...
@@ -124,18 +124,18 @@ class FlaxAutoModel(object):
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
- Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
pretrained model), or
- the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
- by suppling the save directory.
- - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+ by supplying the save directory.
+ - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
- an optional state dictionnary for the model to use instead of a state dictionary loaded from saved
+ an optional state dictionary for the model to use instead of a state dictionary loaded from saved
weights file. This option can be used if you want to create a model from a pretrained configuration but
load your own weights. In this case though, you should check if using
:func:`~transformers.FlaxPreTrainedModel.save_pretrained` and
@@ -150,14 +150,14 @@ class FlaxAutoModel(object):
they exists.
resume_download: (`optional`) boolean, default False:
- Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+ Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
output_loading_info: (`optional`) boolean:
- Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error
+ Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
...
@@ -64,7 +64,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- - 0 for tokens that are **maked**.
+ - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
...
@@ -226,7 +226,7 @@ class FunnelAttentionStructure(nn.Module):
d_model = self.config.d_model
if self.config.attention_type == "factorized":
# Notations from the paper, appending A.2.2, final formula.
- # We need to create and return the matrics phi, psi, pi and omega.
+ # We need to create and return the matrices phi, psi, pi and omega.
pos_seq = torch.arange(0, seq_len, 1.0, dtype=dtype, device=device)
freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=dtype, device=device)
inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
@@ -1226,7 +1226,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
@add_start_docstrings(
"""
- Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
+ Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
first timestep of the last hidden state) e.g. for GLUE tasks.
""",
FUNNEL_START_DOCSTRING,
...
@@ -588,7 +588,7 @@ class GPT2Model(GPT2PreTrainedModel):
attention_mask = (1.0 - attention_mask) * -10000.0
# If a 2D ou 3D attention mask is provided for the cross-attention
- # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.add_cross_attention and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
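On the comment fixed above: making a 2D padding mask broadcastable means inserting singleton head and query dimensions so it lines up with attention scores of shape (batch_size, num_heads, seq_length, seq_length), and turning masked positions into a large negative bias. A small illustrative sketch (not the library's own helper)::

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])                  # (batch_size, seq_length)
    extended_mask = attention_mask[:, None, None, :].float()       # (batch_size, 1, 1, seq_length)
    extended_mask = (1.0 - extended_mask) * -10000.0               # 0 where attended, -10000 on padding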
@@ -708,7 +708,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
- # create postion_ids on the fly for batch generation
+ # create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past:
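A worked example of the two lines shown in this hunk, assuming a left-padded batch: the cumulative sum of the attention mask (minus one) produces increasing positions for the real tokens, and the padded positions are then filled with a dummy value::

    import torch

    attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
    position_ids = attention_mask.long().cumsum(-1) - 1    # tensor([[-1, -1, 0, 1, 2]])
    position_ids.masked_fill_(attention_mask == 0, 1)      # tensor([[ 1,  1, 0, 1, 2]])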
@@ -1050,7 +1050,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
- f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]
...
@@ -382,7 +382,7 @@ class LongformerSelfAttention(nn.Module):
# batch_size x num_heads x max_num_global_attention_tokens x sequence_length
# which is the attention weights from tokens with global attention to all tokens
# It doesn't not return local attention
- # In case of variable number of global attantion in the rows of a batch,
+ # In case of variable number of global attention in the rows of a batch,
# attn_probs are padded with -10000.0 attention scores
attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
else:
@@ -416,7 +416,7 @@ class LongformerSelfAttention(nn.Module):
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
- (pad & diagonilize) =>
+ (pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
@@ -440,7 +440,7 @@ class LongformerSelfAttention(nn.Module):
@staticmethod
def _chunk(hidden_states, window_overlap):
- """convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+ """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
# non-overlapping chunks of size = 2w
hidden_states = hidden_states.view(
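To illustrate the docstring fixed above: with window_overlap w, the sequence is cut into chunks of length 2w, and consecutive chunks share w positions. An illustrative sketch using ``Tensor.unfold`` (not the exact Longformer implementation)::

    import torch

    w = 2                                                    # window_overlap
    hidden_states = torch.arange(8).view(1, 8, 1)            # (batch, seq_len, hidden)
    chunks = hidden_states.unfold(1, 2 * w, w)               # chunks of size 2w, step w
    # chunks.squeeze() -> [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]]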
@@ -491,7 +491,7 @@ class LongformerSelfAttention(nn.Module):
chunked_query = self._chunk(query, window_overlap)
chunked_key = self._chunk(key, window_overlap)
- # matrix multipication
+ # matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap
@@ -1030,7 +1030,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+ Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
...
@@ -58,7 +58,7 @@ class GeLU(nn.Module):
@dataclass
class LxmertModelOutput(ModelOutput):
"""
- Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+ Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder")
@@ -405,7 +405,7 @@ class LxmertSelfAttentionLayer(nn.Module):
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, attention_mask, output_attentions=False):
- # Self attention attends to itself, thus keys and querys are the same (input_tensor).
+ # Self attention attends to itself, thus keys and queries are the same (input_tensor).
output = self.self(
input_tensor,
input_tensor,
@@ -799,7 +799,7 @@ LXMERT_START_DOCSTRING = r"""
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
- question answering attribute prediction, and object tag predicition.
+ question answering attribute prediction, and object tag prediction.
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
@@ -1076,12 +1076,10 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- cur_qa_logit_layer (:obj:`torch.nn.Linear`):
- Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+ just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
anything.
Return:
@@ -1298,12 +1296,10 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- cur_qa_logit_layer (:obj:`torch.nn.Linear`):
- Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+ just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
anything.
Return:
...
@@ -887,7 +887,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
)
# If a 2D ou 3D attention mask is provided for the cross-attention
- # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
...
@@ -40,7 +40,7 @@ class RetrievAugLMMarginOutput(ModelOutput):
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
- Languaged modeling loss.
+ Language modeling loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
@@ -413,7 +413,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
Used by the (:class:`~transformers.RagModel`) model during decoding.
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Provide for generation tasks. `None` by default, constuct as per instructions for the generator model
+ Provide for generation tasks. `None` by default, construct as per instructions for the generator model
you're using with your RAG instance.
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
@@ -424,7 +424,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
:obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
decoding.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+ Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
:obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
:obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
:obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
@@ -660,7 +660,7 @@ class RagModel(RagPreTrainedModel):
@add_start_docstrings_to_model_forward(
"""
- A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
+ A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
@@ -736,7 +736,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
- >>> # or use retriever seperately
+ >>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -940,13 +940,13 @@ class RagSequenceForGeneration(RagPreTrainedModel):
) # batch_size x n_docs x tgt_len x dim
doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
- # RAG-sequence marginaliation
+ # RAG-sequence marginalization
first_token_scores = seq_logprobs[:, :, :1, :]
second_token_scores = seq_logprobs[:, :, 1:2, :]
remainder = seq_logprobs[:, :, 2:, :]
rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
- # calcualate loss
+ # calculate loss
target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
assert target.dim() == rag_logprobs.dim()
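The comments fixed in this hunk refer to RAG-sequence marginalization: the log-likelihood of a generated sequence combines, in log space, the retrieval score of each document with the sequence score computed conditioned on that document, then takes a log-sum-exp over documents. A minimal sketch of that idea (shapes simplified, not the library's exact loss)::

    import torch

    batch_size, n_docs = 2, 5
    doc_logprobs = torch.log_softmax(torch.randn(batch_size, n_docs), dim=1)   # log p(doc | question)
    seq_logprobs = torch.randn(batch_size, n_docs)                             # sum of token log-probs per doc
    marginalized = torch.logsumexp(doc_logprobs + seq_logprobs, dim=1)         # log p(answer | question)
    loss = -marginalized.mean()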
@@ -986,7 +986,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
@add_start_docstrings_to_model_forward(
"""
- A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
+ A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
@@ -1129,7 +1129,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
- >>> # or use retriever seperately
+ >>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -1257,7 +1257,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
to the forward pass. :obj:`context_input_ids` are returned by
:meth:`~transformers.RagRetriever.__call__`.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+ Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
:obj:`question_encoder_last_hidden_state`.
If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
...
@@ -986,7 +986,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
class ReverseSort(Function):
"""
After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized
- backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here.
+ backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here.
"""
@staticmethod
@@ -2075,7 +2075,7 @@ class ReformerModel(ReformerPreTrainedModel):
device=device,
)
- # start index for postion encoding depends on incremental decoding
+ # start index for position encoding depends on incremental decoding
if past_buckets_states is not None:
start_idx_pos_encodings = past_buckets_states[0][1].shape[1]
else:
...
@@ -79,7 +79,7 @@ RETRIBERT_START_DOCSTRING = r"""
@add_start_docstrings(
- """Bert Based model to embed queries or document for document retreival. """,
+ """Bert Based model to embed queries or document for document retrieval. """,
RETRIBERT_START_DOCSTRING,
)
class RetriBertModel(RetriBertPreTrainedModel):
@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
attention_mask, input_shape, device
)
- # define function for cehckpointing
+ # define function for checkpointing
def partial_encode(*inputs):
encoder_outputs = sent_encoder.encoder(
inputs[0],
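The comment fixed just above concerns gradient checkpointing: part of the forward pass is wrapped in a helper such as ``partial_encode`` so it can be run under ``torch.utils.checkpoint.checkpoint``, which saves memory by recomputing intermediate activations during the backward pass. A generic sketch of the pattern (the ``partial_forward`` helper here is hypothetical, not RetriBERT's own code)::

    import torch
    from torch.utils import checkpoint

    layer = torch.nn.Linear(16, 16)
    inputs = torch.randn(4, 16, requires_grad=True)

    def partial_forward(x):
        return layer(x)

    out = checkpoint.checkpoint(partial_forward, inputs)    # activations recomputed on backward
    out.sum().backward()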
@@ -200,7 +200,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
Return:
:obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
- its corresponding document and each cocument to its corresponding query in the batch
+ its corresponding document and each document to its corresponding query in the batch
"""
device = input_ids_query.device
q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
...