Unverified Commit 969859d5 authored by Santiago Castro, committed by GitHub

Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
parent 4731a00c
@@ -359,7 +359,7 @@ class ModuleUtilsMixin:
Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
- <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overriden for transformers with parameter
+ <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
Args:
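For context, a minimal sketch (not the library code) of the approximation this docstring describes: ignoring the attention term, a combined forward and backward pass costs roughly six floating-point operations per parameter per token, as in the cited paper. The function name and arguments are illustrative only.

```python
def approx_train_flops(num_parameters: int, batch_size: int, sequence_length: int) -> int:
    """Rough forward+backward FLOPs, neglecting the term quadratic in sequence length."""
    num_tokens = batch_size * sequence_length
    return 6 * num_parameters * num_tokens
```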
@@ -366,7 +366,7 @@ XLM_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
- also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+ also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``.
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
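As an illustration of why `lengths` and `attention_mask` are interchangeable here, a small sketch (values are made up) of building the equivalent mask from per-sentence lengths:

```python
import torch

lengths = torch.tensor([5, 3])   # two sentences with 5 and 3 real tokens
max_len = 7                      # padded sequence length of the batch
attention_mask = (torch.arange(max_len)[None, :] < lengths[:, None]).long()
# attention_mask -> [[1, 1, 1, 1, 1, 0, 0],
#                    [1, 1, 1, 0, 0, 0, 0]]
```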
@@ -1132,7 +1132,7 @@ class XLNetModel(XLNetPreTrainedModel):
# data mask: input mask & perm mask
assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) "
- "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+ "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
if input_mask is None and attention_mask is not None:
input_mask = 1.0 - attention_mask
if input_mask is not None and perm_mask is not None:
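A minimal illustration of the conversion performed in this hunk: `attention_mask` marks real tokens with 1 and padding with 0, while XLNet's `input_mask` uses the opposite convention.

```python
import torch

attention_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0, 0.0]])  # 1 = real token, 0 = padding
input_mask = 1.0 - attention_mask                            # 1 = padding
# input_mask -> tensor([[0., 0., 0., 1., 1.]])
```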
@@ -97,7 +97,7 @@ def create_optimizer(
Args:
init_lr (:obj:`float`):
The desired learning rate at the end of the warmup phase.
- num_train_step (:obj:`int`):
+ num_train_steps (:obj:`int`):
The total number of training steps.
num_warmup_steps (:obj:`int`):
The number of warmup steps.
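To show how these three arguments interact, a hedged, self-contained sketch of a linear warmup followed by a linear decay; the actual schedule built by `create_optimizer` may differ in details (for example the decay shape or a minimum rate).

```python
def lr_at_step(step: int, init_lr: float, num_train_steps: int, num_warmup_steps: int) -> float:
    """Hypothetical helper: rise linearly to `init_lr` over the warmup steps,
    then decay linearly towards zero by `num_train_steps`."""
    if step < num_warmup_steps:
        return init_lr * step / max(1, num_warmup_steps)
    remaining = max(0, num_train_steps - step)
    return init_lr * remaining / max(1, num_train_steps - num_warmup_steps)
```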
@@ -465,8 +465,6 @@ class RagRetriever:
Postprocessing retrieved ``docs`` and combining them with ``input_strings``.
Args:
- doc_scores (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
- Retrieval scores of respective docs - passed for logging.
docs (:obj:`dict`):
Retrieved documents.
input_strings (:obj:`str`):
@@ -293,7 +293,7 @@ class BertTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
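A standalone sketch of what the ``return`` line above computes (the same pattern recurs in the tokenizer hunks below); the ids are the usual bert-base-uncased values and are illustrative only.

```python
cls_token_id, sep_token_id = 101, 102                    # illustrative BERT ids
token_ids_0 = [cls_token_id, 2023, 2003, sep_token_id]   # a sequence already wrapped in special tokens
mask = [1 if t in (sep_token_id, cls_token_id) else 0 for t in token_ids_0]
assert mask == [1, 0, 0, 1]                               # 1 = special token, 0 = sequence token
```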
@@ -223,7 +223,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -184,7 +184,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -191,7 +191,7 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -623,7 +623,7 @@ class DebertaTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(
map(
@@ -431,7 +431,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(
map(
@@ -40,7 +40,7 @@ class HerbertTokenizer(XLMTokenizer):
Peculiarities:
- - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurence of a
+ - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
punctuation character will be treated separately.
- Such pretokenized input is BPE subtokenized
@@ -39,8 +39,8 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
Peculiarities:
- - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurence of a
- punctuation character will be treated separately.
+ - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
+ a punctuation character will be treated separately.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
@@ -122,7 +122,7 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -136,7 +136,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
prefix_ones = [1] * len(self.prefix_tokens)
@@ -132,7 +132,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
prefix_ones = [1] * len(self.prefix_tokens)
@@ -204,7 +204,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -206,7 +206,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
- "ids is already formated with special tokens for the model."
+ "ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -129,7 +129,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to lowercase the input when tokenizing.
delimiter (:obj:`str`, `optional`):
- The delimiter used btween tokens.
+ The delimiter used between tokens.
vocab_file (:obj:`str`, `optional`):
File containing the vocabulary (from the original implementation).
pretrained_vocab_file (:obj:`str`, `optional`):
@@ -53,7 +53,7 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
- # \t, \n, and \r are technically contorl characters but we treat them
+ # \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
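The hunk only shows the ASCII fast path of this helper. For reference, a self-contained sketch of the complete check, assuming the usual fallback to the Unicode space-separator category:

```python
import unicodedata

def _is_whitespace(char: str) -> bool:
    # ASCII fast path: tab, newline and carriage return are control characters,
    # but are treated as whitespace here.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    # Fallback: any character in the Unicode "Zs" (space separator) category.
    return unicodedata.category(char) == "Zs"
```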
@@ -367,7 +367,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
vocabulary.
Args:
- token (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+ tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
Returns:
:obj:`int` or :obj:`List[int]`: The token id or list of token ids.
@@ -644,7 +644,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
:obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
Args:
- test (:obj:`str`):
+ text (:obj:`str`):
The text to prepare.
is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the text has been pretokenized.
@@ -669,7 +669,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
token_ids_1 (:obj:`List[int]`, `optional`):
List of ids of the second sequence.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the token list is already formated with special tokens for the model.
+ Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -732,7 +732,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
# To avoid mixing byte-level and unicode for byte-level BPT
- # we need to build string separatly for added tokens and byte-level tokens
+ # we need to build string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/transformers/issues/1133
sub_texts = []
current_sub_text = []
@@ -14,7 +14,7 @@
# limitations under the License.
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
- fronting encoding methodes) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
+ fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
of output with special method for the Fast tokenizers)
"""
@@ -537,10 +537,10 @@ class BatchEncoding(UserDict):
Args:
batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
- the character in the orginal string.
+ the character in the original string.
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
- orginal string.
+ original string.
Returns:
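A hedged usage sketch of the character-index lookup this docstring belongs to (shown here with ``BatchEncoding.char_to_token``); the checkpoint name and the exact index returned are illustrative.

```python
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")  # example checkpoint
encoding = tokenizer("Hello world")
# Character 6 of the original string is the "w" of "world"; with a single sequence,
# the first argument is interpreted directly as the character index.
token_index = encoding.char_to_token(6)
print(token_index)  # e.g. 2, with [CLS] at position 0 and "hello" at position 1
```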
@@ -607,7 +607,7 @@ class BatchEncoding(UserDict):
tensor = as_tensor(value)
- # Removing this for now in favor of controling the shape with `prepend_batch_axis`
+ # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
# # at-least2d
# if tensor.ndim > 2:
# tensor = tensor.squeeze(0)
@@ -648,7 +648,7 @@ class SpecialTokensMixin:
"""
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
- used to directly access these special tokens in a model-independant manner and allow to set and update the special
+ used to directly access these special tokens in a model-independent manner and allow to set and update the special
tokens.
Args:
@@ -696,8 +696,8 @@ class SpecialTokensMixin:
self.verbose = verbose
# We directly set the hidden value to allow initialization with special tokens
- # which are not yet in the vocabulary. Necesssary for serialization/de-serialization
- # TODO clean this up at some point (probably by sitching to fast tokenizers)
+ # which are not yet in the vocabulary. Necessary for serialization/de-serialization
+ # TODO clean this up at some point (probably by switching to fast tokenizers)
for key, value in kwargs.items():
if value is None:
continue
@@ -721,7 +721,7 @@ class SpecialTokensMixin:
Add the missing ones to the vocabulary if needed.
Return:
- :obj:`int`: The number of tokens added in the vocaulary during the operation.
+ :obj:`int`: The number of tokens added in the vocabulary during the operation.
"""
return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
@@ -805,7 +805,7 @@ class SpecialTokensMixin:
string token to let you personalize its behavior: whether this token should only match against a single
word, whether this token should strip all potential whitespaces on the left side, whether this token
should strip all potential whitespaces on the right side, etc.
- special_token (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Can be used to specify if the token is a special token. This mostly change the normalization behavior
(special tokens like CLS or [MASK] are usually not lower-cased for instance).
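A hedged usage sketch of the renamed ``special_tokens`` argument; the checkpoint name and the token string are examples only.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
# Tokens added with special_tokens=True are handled like built-in special tokens,
# e.g. they are not lower-cased by a lowercasing tokenizer.
num_added = tokenizer.add_tokens(["<CTRL>"], special_tokens=True)
print(num_added)  # 1, assuming "<CTRL>" was not already in the vocabulary
```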
@@ -1799,7 +1799,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
modifying :obj:`tokenizer.do_lower_case` after creation).
Args:
- save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
+ save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
separate added_tokens files or in the unified JSON file format for the `tokenizers` library. It's only
@@ -2006,15 +2006,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is False:
if verbose:
- if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
+ if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
logger.warning(
- "Truncation was not explicitely activated but `max_length` is provided a specific value, "
- "please use `truncation=True` to explicitely truncate examples to max length. "
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, "
+ "please use `truncation=True` to explicitly truncate examples to max length. "
"Defaulting to 'longest_first' truncation strategy. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
"more precisely by providing a specific strategy to `truncation`."
)
- self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
+ self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
truncation = "longest_first"
# Get padding strategy
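A hedged usage sketch of how a caller avoids the warning emitted above, by activating truncation explicitly alongside ``max_length``; the checkpoint name and text are examples only.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
batch = tokenizer(
    ["a very long piece of text " * 50],
    max_length=32,
    truncation=True,          # explicit, instead of relying on the implicit 'longest_first' default
    padding="max_length",
)
print(len(batch["input_ids"][0]))  # 32
```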
@@ -2591,7 +2591,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
<../glossary.html#token-type-ids>`__
- Should be overriden in a subclass if the model has a special way of building those.
+ Should be overridden in a subclass if the model has a special way of building those.
Args:
token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
@@ -2611,7 +2611,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens.
- This implementation does not add special tokens and this method should be overriden in a subclass.
+ This implementation does not add special tokens and this method should be overridden in a subclass.
Args:
token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
@@ -2783,7 +2783,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
and ``convert_tokens_to_ids`` methods.
num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
Number of tokens to remove using the truncation strategy.
- truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+ truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
The strategy to follow for truncation. Can be:
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
@@ -2798,12 +2798,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
greater than the model maximum admissible input size).
- max_length (:obj:`int`, `optional`):
- Controls the maximum length to use by one of the truncation/padding parameters.
- If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
- length is required by one of the truncation/padding parameters. If the model has no specific maximum
- input length (like XLNet) truncation/padding to a maximum length will be deactivated.
stride (:obj:`int`, `optional`, defaults to 0):
If set to a positive number, the overflowing tokens returned will contain some tokens from the main
sequence returned. The value of this argument defines the number of additional tokens.
@@ -2871,7 +2865,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
- Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch)
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
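A hedged usage sketch of the public padding entry point that the helper documented above backs; the checkpoint name and sentences are examples only.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
encoded = tokenizer(["short", "a somewhat longer sentence"], padding=False)
padded = tokenizer.pad(encoded, padding="longest", return_attention_mask=True)
# Every row of padded["input_ids"] now has the length of the longest sequence in the batch;
# padded["attention_mask"] marks real tokens with 1 and padding with 0.
```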
@@ -3037,7 +3031,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
token_ids_1 (:obj:`List[int]`, `optional`):
List of ids of the second sequence.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not the token list is already formated with special tokens for the model.
+ Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
@@ -3058,7 +3052,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
@staticmethod
def clean_up_tokenization(out_string: str) -> str:
"""
- Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
+ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
Args:
out_string (:obj:`str`): The text to clean up.
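For intuition, a self-contained sketch of the kind of cleanup this static method performs; the replacement list here is illustrative and may not match the library's exact set.

```python
def clean_up_tokenization_sketch(out_string: str) -> str:
    # Remove common detokenization artifacts: spaces before punctuation
    # and before contracted forms such as "n't" or "'s".
    replacements = [(" .", "."), (" ,", ","), (" !", "!"), (" ?", "?"), (" n't", "n't"), (" 's", "'s")]
    for old, new in replacements:
        out_string = out_string.replace(old, new)
    return out_string

print(clean_up_tokenization_sketch("it 's fine , do n't worry ."))  # "it's fine, don't worry."
```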