Unverified Commit b24ead87 authored by LSinev, committed by GitHub

fix some typos in docs, comments, logging/errors (#11432)

parent e3e70f95
@@ -2021,7 +2021,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation
case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the
aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold
-for this is a hyperparameter `cell_selection_preference
+for this is a hyperparameter `cell_selection_preference`
Args:
answer (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, )`):
@@ -2041,7 +2041,7 @@ def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference,
aggregate_mask_init = torch.logical_not(torch.isnan(answer)).type(torch.FloatTensor).to(answer.device)
logits_aggregation = aggregation_classifier(pooled_output)
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
-# Index 0 correponds to "no aggregation".
+# Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Cell selection examples according to current model.
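The `cell_selection_preference` threshold fixed above is what resolves the ambiguous case the docstring describes. A minimal sketch with made-up logits, mirroring the shape of the code in this hunk but not the exact library implementation:

```python
import torch

# Illustrative inputs only; in the model, logits_aggregation comes from the aggregation classifier.
cell_selection_preference = 0.5                      # the hyperparameter from the docstring
answer = torch.tensor([2.0, float("nan"), 7.0])      # NaN means "no scalar answer"
logits_aggregation = torch.tensor(
    [[2.0, 0.1, 0.1, 0.1],   # mass concentrated on index 0 ("no aggregation")
     [0.1, 2.0, 0.1, 0.1],
     [0.1, 0.1, 2.0, 0.1]]
)

# Examples whose answer is a number are candidates for aggregation.
aggregate_mask_init = torch.logical_not(torch.isnan(answer)).float()

# Index 0 corresponds to "no aggregation"; sum the probability mass of the remaining ops.
probs = torch.distributions.Categorical(logits=logits_aggregation).probs
aggregation_ops_total_mass = probs[:, 1:].sum(dim=1)

# Where the model itself prefers cell selection, drop the example from the mask.
prefers_selection = aggregation_ops_total_mass <= cell_selection_preference
aggregate_mask = torch.where(prefers_selection, torch.zeros_like(aggregate_mask_init), aggregate_mask_init)
print(aggregate_mask)  # tensor([0., 0., 1.])
```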
@@ -2126,7 +2126,7 @@ def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask):
answer supervision) per example.
"""
dist_aggregation = torch.distributions.categorical.Categorical(logits=logits_aggregation)
-# Index 0 correponds to "no aggregation".
+# Index 0 corresponds to "no aggregation".
aggregation_ops_total_mass = torch.sum(dist_aggregation.probs[:, 1:], dim=1)
# Predict some aggregation in case of an answer that needs aggregation.
# This increases the probability of all aggregation functions, in a way
......
@@ -2357,7 +2357,7 @@ def _get_numeric_value_from_date(date, mask):
def _get_span_length_key(span):
-"""Sorts span by decreasing length first and incresing first index second."""
+"""Sorts span by decreasing length first and increasing first index second."""
return span[1] - span[0], -span[0]
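The sort key fixed above is easier to read next to the order it produces; a small sketch, assuming the helper is consumed with `sorted(..., reverse=True)` (that call is not shown in this hunk):

```python
# Standalone copy of the helper for illustration.
def span_length_key(span):
    return span[1] - span[0], -span[0]

spans = [(3, 5), (0, 4), (2, 6)]
# Decreasing length first; ties broken by the smaller (increasing) first index.
print(sorted(spans, key=span_length_key, reverse=True))  # [(0, 4), (2, 6), (3, 5)]
```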
......
@@ -292,7 +292,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
-raise ValueError("No <unkown> token in vocabulary")
+raise ValueError("No <unknown> token in vocabulary")
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if os.path.isdir(save_directory):
......
@@ -382,9 +382,9 @@ class Wav2Vec2Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
-# this operation is a bit akward, but it's required to
+# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
-# In order to do so, attn_weights have to reshaped
+# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
@@ -111,7 +111,7 @@ class Wav2Vec2Processor:
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
-Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the doctsring of
+Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
the above two methods for more information.
"""
return self.current_processor(*args, **kwargs)
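A short usage sketch of the dispatch that docstring describes; the checkpoint name and inputs are only illustrative:

```python
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

raw_speech = [0.0] * 16_000                                                # one second of dummy audio at 16 kHz
inputs = processor(raw_speech, sampling_rate=16_000, return_tensors="pt")  # feature-extractor path

with processor.as_target_processor():                                      # tokenizer path for the labels
    labels = processor("HELLO WORLD", return_tensors="pt").input_ids
```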
......
@@ -348,7 +348,7 @@ XLM_INPUTS_DOCSTRING = r"""
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
-to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
+to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
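The two mappings named above are what you use to build the `langs` tensor for a multilingual checkpoint; a hedged sketch (checkpoint name taken from the multilingual docs, inputs are illustrative):

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-xnli15-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-xnli15-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])
lang_id = model.config.lang2id["en"]          # language name -> language id (str to int)
langs = torch.full_like(input_ids, lang_id)   # one language id per token, same shape as input_ids

outputs = model(input_ids, langs=langs)
print(model.config.id2lang[lang_id])          # language id -> language name, prints "en"
```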
@@ -1188,7 +1188,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
self.init_weights()
-@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choicec, sequence_length"))
+@add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
......
@@ -749,8 +749,8 @@ class XLMTokenizer(PreTrainedTokenizer):
def _tokenize(self, text, lang="en", bypass_tokenizer=False):
"""
-Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific
-tokenizerself. Otherwise, we use Moses.
+Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizer.
+Otherwise, we use Moses.
Details of tokenization:
......
@@ -1113,7 +1113,7 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **masked**,
-- 0 for tokens that are **not maked**.
+- 0 for tokens that are **not masked**.
You can only uses one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
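Since `input_mask` uses the opposite convention of `attention_mask` (1 marks a token to ignore rather than a token to attend to), converting between the two is a one-liner; a tiny sketch with a dummy batch:

```python
import numpy as np

attention_mask = np.array([[1, 1, 1, 0, 0]])  # 1 = real token, 0 = padding
input_mask = 1 - attention_mask               # 1 = masked, 0 = not masked; pass only one of the two
```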
......
@@ -182,7 +182,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
name (:obj:`str`, `optional`, defaults to 'AdamWeightDecay'):
Optional name for the operations created when applying gradients.
kwargs:
-Keyward arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
+Keyword arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
gradients by norm; ``clipvalue`` is clip gradients by value, ``decay`` is included for backward
compatibility to allow time inverse decay of learning rate. ``lr`` is included for backward compatibility,
recommended to use ``learning_rate`` instead.
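A hedged usage sketch of those kwargs (requires TensorFlow; the hyperparameter values are placeholders):

```python
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=3e-5,        # preferred over the legacy ``lr`` alias
    weight_decay_rate=0.01,
    clipnorm=1.0,              # forwarded to the underlying Keras Adam: clip gradients by norm
)
```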
......
@@ -128,7 +128,7 @@ class Conversation:
"""
Iterates over all blobs of the conversation.
-Retuns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
+Returns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
:obj:`bool`, ``text_chunks`` is a :obj:`str`.
"""
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
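A short sketch of consuming that iterator, assuming the usual `mark_processed`/`append_response` helpers are used to fill the conversation first:

```python
from transformers import Conversation

conversation = Conversation("Is it going to rain today?")
conversation.mark_processed()                                       # move the input into past_user_inputs
conversation.append_response("It should stay dry all afternoon.")

for is_user, text in conversation.iter_texts():
    print(("user" if is_user else "bot") + " >> " + text)
```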
......
@@ -48,7 +48,7 @@ class Text2TextGenerationPipeline(Pipeline):
def check_inputs(self, input_length: int, min_length: int, max_length: int):
"""
-Checks wether there might be something wrong with given input with regard to the model.
+Checks whether there might be something wrong with given input with regard to the model.
"""
return True
@@ -204,7 +204,7 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
"""
-Checks wether there might be something wrong with given input with regard to the model.
+Checks whether there might be something wrong with given input with regard to the model.
"""
if input_length < min_length // 2:
logger.warning(
......
@@ -520,7 +520,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
-Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well asin a unique JSON
+Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
file containing {config + vocab + added-tokens}.
"""
save_directory = str(save_directory)
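A hedged sketch of requesting that legacy layout through the public API, assuming the `legacy_format` flag of `save_pretrained` rather than calling this private method directly:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Writes the vocabulary and added tokens in the slow-tokenizer layout described above.
tokenizer.save_pretrained("./my-tokenizer", legacy_format=True)
```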
......
@@ -208,7 +208,7 @@ class Trainer:
Note that if it's a :obj:`torch.utils.data.dataset.IterableDataset` with some randomization and you are
training in a distributed fashion, your iterable dataset should either use a internal attribute
-:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identic on all
+:obj:`generator` that is a :obj:`torch.Generator` for the randomization that must be identical on all
processes (and the Trainer will manually set the seed of this :obj:`generator` at each epoch) or have a
:obj:`set_epoch()` method that internally sets the seed of the RNGs used.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
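A minimal sketch of an iterable dataset that satisfies that contract via a `set_epoch()` method (the alternative being a `generator` attribute that the Trainer re-seeds itself); everything here is illustrative:

```python
import torch
from torch.utils.data import IterableDataset

class ShuffledStream(IterableDataset):
    """Toy iterable dataset whose shuffling can be re-seeded identically on every process."""

    def __init__(self, data, base_seed: int = 42):
        self.data = list(data)
        self.base_seed = base_seed
        self.generator = torch.Generator()

    def set_epoch(self, epoch: int):
        # Same seed on every process => same shuffling order everywhere.
        self.generator.manual_seed(self.base_seed + epoch)

    def __iter__(self):
        for i in torch.randperm(len(self.data), generator=self.generator):
            yield self.data[int(i)]
```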
......
@@ -42,7 +42,7 @@ class TrainerState:
In all this class, one step is to be understood as one update step. When using gradient accumulation, one
update step may require several forward and backward passes: if you use :obj:`gradient_accumulation_steps=n`,
-then one update step requires going throuch `n` batches.
+then one update step requires going through `n` batches.
Args:
epoch (:obj:`float`, `optional`):
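For example, with a per-device batch size of 8 and `gradient_accumulation_steps=4`, one update step runs 4 forward/backward passes and consumes 4 batches (32 examples per device) before the optimizer steps once.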
......
@@ -243,7 +243,7 @@ class SequentialDistributedSampler(Sampler):
def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
warnings.warn(
-"SequentialDistributedSampler is deprecated and will be removed in v5 of Tranformers.",
+"SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
if num_replicas is None:
@@ -363,7 +363,7 @@ class DistributedTensorGatherer:
def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
warnings.warn(
-"DistributedTensorGatherer is deprecated and will be removed in v5 of Tranformers.",
+"DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.",
FutureWarning,
)
self.world_size = world_size
......
@@ -199,7 +199,7 @@ class Seq2SeqTrainer(Trainer):
def _pad_tensors_to_max_len(self, tensor, max_length):
if self.tokenizer is None:
raise ValueError(
-f"Tensor need to be padded to `max_length={max_length}` but no tokenzier was passed when creating "
+f"Tensor need to be padded to `max_length={max_length}` but no tokenizer was passed when creating "
"this `Trainer`. Make sure to create your `Trainer` with the appropriate tokenizer."
)
# If PAD token is not defined at least EOS token has to be defined
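A hedged sketch of what that padding helper does once a pad (or, failing that, EOS) token id is available; the function name and values below are illustrative only:

```python
import torch

def pad_to_max_len(tensor: torch.Tensor, max_length: int, pad_token_id: int) -> torch.Tensor:
    # Allocate a (batch, max_length) tensor full of the pad id, then copy the real tokens in.
    padded = torch.full((tensor.shape[0], max_length), pad_token_id, dtype=tensor.dtype, device=tensor.device)
    padded[:, : tensor.shape[-1]] = tensor
    return padded

print(pad_to_max_len(torch.tensor([[5, 6, 7]]), max_length=5, pad_token_id=0))  # tensor([[5, 6, 7, 0, 0]])
```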
......
@@ -140,7 +140,7 @@ def get_verbosity() -> int:
def set_verbosity(verbosity: int) -> None:
"""
-Set the vebosity level for the 🤗 Transformers's root logger.
+Set the verbosity level for the 🤗 Transformers's root logger.
Args:
verbosity (:obj:`int`):
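A short usage sketch of the verbosity API this hunk documents:

```python
import transformers

transformers.logging.set_verbosity(transformers.logging.INFO)  # set an explicit level
transformers.logging.set_verbosity_error()                     # convenience wrapper: only errors are shown
```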
......