Unverified Commit e1f3156b authored by Santiago Castro, committed by GitHub

Fix many typos (#8708)

parent 9c0afdaf
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the Dataset 📚
-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the Dataset 📚
-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the downstream task (Question Paraphrasing) - Dataset 📚❓↔️❓
-Dataset ID: ```quora``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```quora``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
-Dataset ID: ```squad``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓
-Dataset ID: ```squad_v2``` from [HugginFace/NLP](https://github.com/huggingface/nlp)
+Dataset ID: ```squad_v2``` from [Huggingface/NLP](https://github.com/huggingface/nlp)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
@@ -19,7 +19,7 @@ Transfer learning, where a model is first pre-trained on a data-rich task before
 ## Details of the Dataset 📚
-Dataset ID: ```wikisql``` from [HugginFace/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
+Dataset ID: ```wikisql``` from [Huggingface/NLP](https://huggingface.co/nlp/viewer/?dataset=wikisql)
 | Dataset | Split | # samples |
 | -------- | ----- | --------- |
...
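The model cards touched above all point at datasets hosted through Huggingface/NLP. A minimal sketch of pulling them, assuming the `nlp` library linked above is installed (it was later renamed to `datasets`, where the same call works):

```python
import nlp

wikisql = nlp.load_dataset("wikisql")      # text-to-SQL pairs
quora = nlp.load_dataset("quora")          # question-paraphrase pairs
squad = nlp.load_dataset("squad")          # extractive Q&A
squad_v2 = nlp.load_dataset("squad_v2")    # SQuAD plus unanswerable questions

print(wikisql["train"][0])                 # inspect one training sample
```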
@@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
 return tuple with:
 - pytorch model weight name
-- transpose: boolean indicating wether TF2.0 and PyTorch weights matrices are transposed with regards to each
+- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
 other
 """
 tf_name = tf_name.replace(":0", "") # device ids
...
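The docstring fixed above describes the return value of the TF-to-PyTorch name converter. A small sketch of calling it, assuming the helper still lives in `transformers.modeling_tf_pytorch_utils` as in this revision; the variable name is made up for illustration:

```python
from transformers.modeling_tf_pytorch_utils import convert_tf_weight_name_to_pt_weight_name

# A hypothetical TF2.0 variable name; ":0" is the device id stripped in the snippet above.
tf_name = "bert/encoder/layer_._0/attention/self/query/kernel:0"

pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_name)
# `pt_name` is the dotted PyTorch parameter name; `transpose` is True for "kernel"
# variables, since TF dense kernels are stored transposed relative to torch.nn.Linear.
print(pt_name, transpose)
```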
@@ -951,7 +951,7 @@ class FSMTModel(PretrainedFSMTModel):
 output_hidden_states=output_hidden_states,
 return_dict=return_dict,
 )
-# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
 elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
 encoder_outputs = BaseModelOutput(
 last_hidden_state=encoder_outputs[0],
...
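For context on the comment fixed above, a tiny sketch of the same wrapping outside the model: a plain tuple of encoder outputs can be promoted to a `BaseModelOutput` so attribute access keeps working (the tensor shape is a dummy value):

```python
import torch
from transformers.modeling_outputs import BaseModelOutput

hidden = torch.zeros(1, 5, 16)        # (batch, sequence, hidden) placeholder tensor
encoder_outputs = (hidden,)           # what a caller might pass as a bare tuple

wrapped = BaseModelOutput(last_hidden_state=encoder_outputs[0])
assert torch.equal(wrapped.last_hidden_state, hidden)
```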
@@ -642,7 +642,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
 raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
 if inputs_embeds is None:
-assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
 inputs_embeds = self.embed_tokens(input_ids)
 batch_size, seq_length = input_shape
...
@@ -667,9 +667,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
 @property
 def logits(self):
-# prediciton scores are the output of the adaptive softmax, see
+# prediction scores are the output of the adaptive softmax, see
 # the file `modeling_transfo_xl_utilities`. Since the adaptive
-# softmax returns the log softmax value, `self.prediciton_scores`
+# softmax returns the log softmax value, `self.prediction_scores`
 # are strictly speaking not exactly `logits`, but behave the same
 # way logits do.
 return self.prediction_scores
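The comment corrected above is worth restating: because the adaptive softmax already applies a log-softmax, exponentiating the returned "logits" yields probabilities. A hedged check, assuming the standard `transfo-xl-wt103` checkpoint:

```python
import torch
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")
model.eval()

input_ids = tokenizer("The quick brown fox", return_tensors="pt")["input_ids"]
with torch.no_grad():
    outputs = model(input_ids, return_dict=True)

# The adaptive softmax already returned log probabilities, so exp() sums to ~1
# over the vocabulary at every position.
print(outputs.logits.exp().sum(dim=-1))
```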
@@ -886,7 +886,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
 head_mask = head_mask.to(
 dtype=next(self.parameters()).dtype
-) # switch to fload if need + fp16 compatibility
+) # switch to float if need + fp16 compatibility
 else:
 head_mask = [None] * self.n_layer
...
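The dtype cast fixed in the comment above goes hand in hand with the reshaping on the first line of the snippet. A standalone sketch, with hypothetical layer and head counts, of how a 2-D `(num_layers, num_heads)` mask gets extra singleton axes for broadcasting and is cast for fp16:

```python
import torch

n_layer, n_head = 18, 16
head_mask = torch.ones(n_layer, n_head)     # per-layer, per-head mask (1.0 = keep the head)
head_mask[0, 3] = 0.0                       # e.g. silence head 3 of layer 0

head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)   # -> (n_layer, 1, 1, 1, n_head)
head_mask = head_mask.to(dtype=torch.float16)                  # the fp16-compatibility cast
print(head_mask.shape)
```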
@@ -91,8 +91,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 Return:
 if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out ::
-[(len-1)*bsz] Negative log likelihood We could replace this implementation by the native PyTorch one if
-their's had an option to set bias on all clusters in the native one. here:
+[(len-1)*bsz] Negative log likelihood. We could replace this implementation by the native PyTorch one if
+theirs had an option to set bias on all clusters in the native one. here:
 https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
 """
...
@@ -633,11 +633,11 @@ XLM_INPUTS_DOCSTRING = r"""
 A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
 languages ids which can be obtained from the language names by using two conversion mappings provided in
 the configuration of the model (only provided for multilingual models). More precisely, the `language name
-to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
+to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
 `language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
 See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
-ttoken_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
+token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
 Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
 1]``:
...
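The `langs` argument documented above can be built from the config mappings named in the corrected line. A sketch under the assumption that a multilingual checkpoint such as `xlm-mlm-xnli15-1024` is used (only multilingual XLM configs carry `lang2id` and `id2lang`):

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-xnli15-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-xnli15-1024")

input_ids = tokenizer("Wikipedia was used to train this model.", return_tensors="pt")["input_ids"]

english_id = model.config.lang2id["en"]          # the `language name -> language id` mapping
langs = torch.full_like(input_ids, english_id)   # one language id per input token

outputs = model(input_ids, langs=langs, return_dict=True)
print(outputs.logits.shape)                      # (batch, sequence, vocab)
```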
@@ -54,7 +54,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 class XLMRobertaTokenizer(PreTrainedTokenizer):
 """
-Adapted from :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on
+Adapted from :class:`~transformers.RobertaTokenizer` and class:`~transformers.XLNetTokenizer`. Based on
 `SentencePiece <https://github.com/google/sentencepiece>`__.
 This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
...
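A brief usage sketch for the SentencePiece-based tokenizer whose docstring is fixed above, using the standard `xlm-roberta-base` checkpoint (the `sentencepiece` package must be installed):

```python
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

print(tokenizer.tokenize("Hello world!"))      # SentencePiece subword pieces, e.g. '▁Hello'
print(tokenizer("Hello world!")["input_ids"])  # ids wrapped in <s> ... </s> special tokens
```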
@@ -904,7 +904,7 @@ XLNET_INPUTS_DOCSTRING = r"""
 Mask values selected in ``[0, 1]``:
 - 1 for tokens that are **masked**,
-- 0 for tokens that are **not maked**.
+- 0 for tokens that are **not masked**.
 You can only uses one of :obj:`input_mask` and :obj:`attention_mask`.
 head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
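As the docstring above notes, `input_mask` uses the opposite convention from `attention_mask`, so only one of the two should be supplied; one can be derived from the other. A minimal sketch:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])   # 1 = real token, 0 = padding
input_mask = 1 - attention_mask                    # 1 = masked (padding), 0 = not masked
print(input_mask)                                  # tensor([[0, 0, 0, 1, 1]])
```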
@@ -1211,7 +1211,7 @@ class XLNetModel(XLNetPreTrainedModel):
 head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
 head_mask = head_mask.to(
 dtype=next(self.parameters()).dtype
-) # switch to fload if need + fp16 compatibility
+) # switch to float if need + fp16 compatibility
 else:
 head_mask = [None] * self.n_layer
...
@@ -167,9 +167,9 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
 beta_2 (:obj:`float`, `optional`, defaults to 0.999):
 The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
 epsilon (:obj:`float`, `optional`, defaults to 1e-7):
-The epsilon paramenter in Adam, which is a small constant for numerical stability.
+The epsilon parameter in Adam, which is a small constant for numerical stability.
 amsgrad (:obj:`bool`, `optional`, default to `False`):
-Whether to apply AMSGrad varient of this algorithm or not, see `On the Convergence of Adam and Beyond
+Whether to apply AMSGrad variant of this algorithm or not, see `On the Convergence of Adam and Beyond
 <https://arxiv.org/abs/1904.09237>`__.
 weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
 The weight decay to apply.
...
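Putting the corrected argument docs above together, a hedged sketch of constructing this TF optimizer (requires TensorFlow; the no-decay pattern list is a common choice, not something the class mandates):

```python
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=3e-5,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-7,                                     # small constant for numerical stability
    amsgrad=False,                                    # set True for the AMSGrad variant
    weight_decay_rate=0.01,                           # decoupled weight decay
    exclude_from_weight_decay=["LayerNorm", "bias"],  # parameters typically excluded from decay
)
```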