Unverified Commit 969859d5 authored by Santiago Castro, committed by GitHub

Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
parent 4731a00c
@@ -44,7 +44,7 @@ _CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
-# This dict contrains shortcut names and associated url
+# This dict contains shortcut names and associated url
# for the pretrained weights provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -156,7 +156,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
class T5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
-Construct a layernorm module in the T5 style No bias and no substraction of mean.
+Construct a layernorm module in the T5 style No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
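The docstring fixed in the hunk above describes T5's layer norm: it rescales by the root mean square of the activations, with no learned bias and no subtraction of the mean. A minimal sketch of that idea (illustration only; `SimpleT5LayerNorm` is a hypothetical name, not the class in this file):

```python
import torch
from torch import nn

class SimpleT5LayerNorm(nn.Module):
    """RMS-style layer norm: no bias and no subtraction of the mean."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Scale by the root mean square over the last dimension only.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)
```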
@@ -256,7 +256,7 @@ class T5Attention(nn.Module):
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
-max_distance: an intege
+max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
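For readers unfamiliar with this helper, the docstring above describes T5's relative-position bucketing: small offsets each get their own bucket, larger offsets share logarithmically sized buckets up to `max_distance`. A scalar sketch of the scheme (simplified from the tensor version in the file; assumes the standard T5 defaults):

```python
import math

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    # Scalar sketch; the real function operates on int32 tensors.
    bucket = 0
    n = -relative_position
    if bidirectional:
        # Half the buckets are reserved for offsets on the other side.
        num_buckets //= 2
        if n < 0:
            bucket += num_buckets
            n = -n
    else:
        n = max(n, 0)
    max_exact = num_buckets // 2
    if n < max_exact:
        return bucket + n  # nearby offsets each get their own bucket
    # Distant offsets share logarithmically sized buckets, capped at num_buckets - 1.
    val = max_exact + int(
        math.log(n / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    )
    return bucket + min(val, num_buckets - 1)
```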
@@ -705,7 +705,7 @@ class T5Stack(T5PreTrainedModel):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
if inputs_embeds is None:
-assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs_embeds = self.embed_tokens(input_ids)
batch_size, seq_length = input_shape
...
@@ -739,7 +739,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
...
@@ -364,14 +364,14 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
model_args (additional positional arguments, `optional`):
Will be passed along to the underlying model ``__init__()`` method.
config (:class:`~transformers.PretrainedConfig`, `optional`):
-Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the `shortcut name` string of a
pretrained model).
- The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
-by suppling the save directory.
+by supplying the save directory.
-- The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+- The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict (`Dict[str, torch.Tensor]`, `optional`):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
@@ -398,7 +398,7 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether or not to only look at local files (e.g., not try doanloading the model).
+Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
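The two hunks above touch the usual `from_pretrained` options for the TF auto classes. A hedged usage sketch of the documented parameters (the checkpoint name is an example, not a recommendation, and the keys of `loading_info` are assumed from the docstring):

```python
from transformers import AutoConfig, TFAutoModel

config = AutoConfig.from_pretrained("bert-base-cased")  # example checkpoint
model, loading_info = TFAutoModel.from_pretrained(
    "bert-base-cased",
    config=config,              # use this config instead of the auto-loaded one
    output_loading_info=True,   # also return missing/unexpected keys and errors
    local_files_only=False,     # set True to skip downloading
)
print(loading_info)             # e.g., missing and unexpected keys
```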
@@ -815,7 +815,7 @@ class TFAutoModelForMaskedLM:
This is a generic model class that will be instantiated as one of the model classes of the library---with a masked
language modeling head---when created with the when created with the
:meth:`~transformers.TFAutoModelForMaskedLM.from_pretrained` class method or the
-:meth:`~transformers.TFAutoModelForMasedLM.from_config` class method.
+:meth:`~transformers.TFAutoModelForMaskedLM.from_config` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
@@ -1297,7 +1297,7 @@ class TFAutoModelForTokenClassification:
class TFAutoModelForMultipleChoice:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
-multiple choice classifcation head---when created with the when created with the
+multiple choice classification head---when created with the when created with the
:meth:`~transformers.TFAutoModelForMultipleChoice.from_pretrained` class method or the
:meth:`~transformers.TFAutoModelForMultipleChoice.from_config` class method.
...
@@ -332,7 +332,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
- **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch,
-embed_dim)`. Only populated if *return_all_hiddens* is True.
+embed_dim)`. Only populated if *output_hidden_states* is True.
- **all_attentions** (List[Tensor]): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
...
@@ -784,7 +784,7 @@ BERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
...
@@ -346,7 +346,7 @@ class TFTransformer(tf.keras.layers.Layer):
Returns:
hidden_state: tf.Tensor(bs, seq_length, dim)
-Sequence of hiddens states in the last (top) layer
+Sequence of hidden states in the last (top) layer
all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
Tuple of length n_layers with the hidden states from each layer.
Optional: only if output_hidden_states=True
@@ -552,7 +552,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
-iinputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
@@ -571,7 +571,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
+"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
...
@@ -109,7 +109,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
-to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
+to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
@@ -128,7 +128,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``:
cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
Dictionary string to ``tf.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -214,7 +214,7 @@ class TFFlaubertPreTrainedModel(TFPreTrainedModel):
@add_start_docstrings(
-"The bare Flaubert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertModel(TFFlaubertPreTrainedModel):
...
@@ -178,7 +178,7 @@ class TFFunnelAttentionStructure:
self.sin_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
self.cos_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
# Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
-# dividide.
+# divided.
self.pooling_mult = None
def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
@@ -219,7 +219,7 @@ class TFFunnelAttentionStructure:
"""
if self.attention_type == "factorized":
# Notations from the paper, appending A.2.2, final formula.
-# We need to create and return the matrics phi, psi, pi and omega.
+# We need to create and return the matrices phi, psi, pi and omega.
pos_seq = tf.range(0, seq_len, 1.0, dtype=dtype)
freq_seq = tf.range(0, self.d_model // 2, 1.0, dtype=dtype)
inv_freq = 1 / (10000 ** (freq_seq / (self.d_model // 2)))
...
@@ -549,7 +549,7 @@ GPT2_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
...
@@ -172,9 +172,9 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
-return incremental_indicies + self.padding_idx
+incremental_indices = tf.math.cumsum(mask, axis=1) * mask
+return incremental_indices + self.padding_idx
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
@@ -560,7 +560,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
# batch_size x num_heads x max_num_global_attention_tokens x sequence_length
# which is the attention weights from tokens with global attention to all tokens
# It doesn't not return local attention
-# In case of variable number of global attantion in the rows of a batch,
+# In case of variable number of global attention in the rows of a batch,
# attn_probs are padded with -10000.0 attention scores
# LOCAL ATTN:
# without global attention, return local attention probabilities
@@ -618,7 +618,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
chunked_query = self._chunk(query, window_overlap)
chunked_key = self._chunk(key, window_overlap)
-# matrix multipication
+# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap
@@ -826,7 +826,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
-(pad & diagonilize) =>
+(pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
@@ -853,7 +853,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
@staticmethod
def _chunk(hidden_states, window_overlap):
-"""convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
batch_size, seq_length, hidden_dim = shape_list(hidden_states)
num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1
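The `_chunk` docstring fixed above describes splitting a sequence into overlapping windows of size `2w` with stride `w`. A NumPy illustration of the resulting chunk count (toy sizes, not the TF implementation):

```python
import numpy as np

window_overlap = 2                      # w
seq = np.arange(8)                      # toy sequence, seq_length = 8
chunks = [seq[i : i + 2 * window_overlap] for i in range(0, len(seq) - window_overlap, window_overlap)]
# chunks -> [0 1 2 3], [2 3 4 5], [4 5 6 7]
num_output_chunks = 2 * (len(seq) // (2 * window_overlap)) - 1
assert len(chunks) == num_output_chunks == 3
```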
@@ -1557,7 +1557,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
-Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
...
@@ -50,7 +50,7 @@ TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@dataclass
class TFLxmertModelOutput(ModelOutput):
"""
-Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder")
@@ -423,7 +423,7 @@ class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer):
self.attention_output = TFLxmertAttentionOutput(config, name="output")
def call(self, input_tensor, attention_mask, output_attentions, training=False):
-# Self attention attends to itself, thus keys and querys are the same (input_tensor).
+# Self attention attends to itself, thus keys and queries are the same (input_tensor).
self_output = self.self(input_tensor, input_tensor, attention_mask, output_attentions)
if output_attentions:
attention_probs = self_output[1]
@@ -868,7 +868,7 @@ LXMERT_START_DOCSTRING = r"""
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
-question answering attribute prediction, and object tag predicition.
+question answering attribute prediction, and object tag prediction.
This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
@@ -962,7 +962,7 @@ LXMERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Lxmert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
LXMERT_START_DOCSTRING,
)
class TFLxmertModel(TFLxmertPreTrainedModel):
...
@@ -952,7 +952,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare MobileBert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.",
MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertModel(TFMobileBertPreTrainedModel):
...
@@ -487,7 +487,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
...
@@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
return tuple with:
- pytorch model weight name
-- transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each
+- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
other
"""
tf_name = tf_name.replace(":0", "") # device ids
@@ -270,7 +270,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
# Instantiate and load the associated TF 2.0 model
-tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining
+tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beginning
tf_model_class = getattr(transformers, tf_model_class_name)
tf_model = tf_model_class(pt_model.config)
...
@@ -118,9 +118,9 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
-return incremental_indicies + self.padding_idx
+incremental_indices = tf.math.cumsum(mask, axis=1) * mask
+return incremental_indices + self.padding_idx
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
@@ -709,7 +709,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
...
@@ -71,7 +71,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
"""
-Construct a layernorm module in the T5 style No bias and no substraction of mean.
+Construct a layernorm module in the T5 style No bias and no subtraction of mean.
"""
super().__init__(**kwargs)
self.variance_epsilon = epsilon
@@ -170,7 +170,7 @@ class TFT5Attention(tf.keras.layers.Layer):
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
-max_distance: an intege
+max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
@@ -682,8 +682,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if self.is_decoder and encoder_attention_mask is not None:
# If a 2D ou 3D attention mask is provided for the cross-attention
-# we need to make broadcastabe to [batch_size, num_heads, mask_seq_length, mask_seq_length]
-# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+# we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
if num_dims_encoder_attention_mask == 3:
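The comments fixed in the hunk above refer to expanding a 2D or 3D cross-attention mask so it broadcasts against `[batch_size, num_heads, seq_length, seq_length]` attention scores. A hedged sketch of that expansion (shapes and the -1e9 masking constant are assumptions for illustration, not the exact code in this file):

```python
import tensorflow as tf

encoder_attention_mask = tf.ones((2, 7))        # [batch_size, mask_seq_length]
if len(encoder_attention_mask.shape) == 3:      # [batch, tgt_len, src_len]
    extended_mask = encoder_attention_mask[:, None, :, :]
else:                                           # [batch, src_len]
    extended_mask = encoder_attention_mask[:, None, None, :]
# Masked positions become large negative numbers before the softmax.
extended_mask = (1.0 - tf.cast(extended_mask, tf.float32)) * -1e9
print(extended_mask.shape)                      # (2, 1, 1, 7), broadcastable over heads and query length
```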
@@ -894,7 +894,7 @@ T5_INPUTS_DOCSTRING = r"""
sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of
the decoder.
past_key_values (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-ontains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
...
@@ -800,7 +800,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
...
@@ -145,7 +145,7 @@ class TFCausalLanguageModelingLoss:
class TFQuestionAnsweringLoss:
"""
-Loss function suitable for quetion answering.
+Loss function suitable for question answering.
"""
def compute_loss(self, labels, logits):
@@ -807,7 +807,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
Args:
vocab_size (:obj:`int`):
-The size of the vocabular, e.g., the number of unique tokens.
+The size of the vocabulary, e.g., the number of unique tokens.
hidden_size (:obj:`int`):
The size of the embedding vectors.
initializer_range (:obj:`float`, `optional`):
@@ -860,7 +860,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
:obj:`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
:obj:`[batch_size, length, embedding_size]`.
-In linear mode, the ouput is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
+In linear mode, the output is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
Raises:
ValueError: if :obj:`mode` is not valid.
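The docstring fixed above distinguishes the shared-embedding layer's "embedding" mode from its "linear" mode: the same weight matrix is used as a lookup table and as an output projection. A toy sketch of the two modes (sizes are arbitrary and this is not the class itself):

```python
import tensorflow as tf

vocab_size, hidden_size = 10, 4
shared_weights = tf.random.normal((vocab_size, hidden_size))      # one shared matrix

input_ids = tf.constant([[1, 2, 3]])                              # [batch_size, length]
embedded = tf.gather(shared_weights, input_ids)                   # "embedding" mode: [batch, length, hidden_size]
logits = tf.matmul(embedded, shared_weights, transpose_b=True)    # "linear" mode: [batch, length, vocab_size]
print(embedded.shape, logits.shape)                               # (1, 3, 4) (1, 3, 10)
```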
@@ -1043,7 +1043,7 @@ def get_initializer(initializer_range: float = 0.02) -> tf.initializers.Truncate
def cast_bool_to_primitive(bool_variable: Union[tf.Tensor, bool], default_tensor_to_true=False) -> bool:
"""
Function arguments can be inserted as boolean tensor and bool variables to cope with Keras serialization we need to
-cast the bool argumnets (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
+cast the bool arguments (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
Args:
bool_variable (:obj:`Union[tf.Tensor, bool]`):
...
@@ -654,7 +654,7 @@ XLM_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``.
cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -688,7 +688,7 @@ XLM_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING,
)
class TFXLMModel(TFXLMPreTrainedModel):
...
@@ -652,7 +652,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
# data mask: input mask & perm mask
assert input_mask is None or attention_mask is None, (
"You can only use one of input_mask (uses 1 for padding) "
-"or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+"or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
)
if input_mask is None and attention_mask is not None:
input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
@@ -1122,7 +1122,7 @@ XLNET_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING,
)
class TFXLNetModel(TFXLNetPreTrainedModel):
...