Unverified Commit b24ead87 authored by LSinev, committed by GitHub

fix some typos in docs, comments, logging/errors (#11432)

parent e3e70f95
......@@ -95,9 +95,9 @@ class LxmertConfig(PretrainedConfig):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective.
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add object prediction, attribute ppredictionand feature regression to the loss objective.
Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add the question-asansweringoss to the objective
Whether or not to add the question-answering loss to the objective
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to calculate the object-prediction loss objective
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
......
......@@ -306,9 +306,9 @@ class M2M100Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
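The reshape comment corrected in this hunk recurs verbatim in the Marian, MBart, Pegasus, ProphetNet and Speech2Text attention modules further down. A minimal sketch (illustrative shapes, not the library code) of the pattern it describes: both .view() calls return views of the same storage, so the attention maps returned when output_attentions=True stay in the autograd graph and keep their gradient instead of being detached.

import torch

# Double-view pattern: views share storage with the original weights, so
# gradients still flow through the returned attention maps.
bsz, num_heads, tgt_len, src_len = 2, 4, 5, 5
attn_weights = torch.rand(bsz * num_heads, tgt_len, src_len, requires_grad=True)

attn_weights_reshaped = attn_weights.view(bsz, num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)

attn_weights.sum().backward()
print(attn_weights_reshaped.grad_fn is not None)  # True: still part of the graph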
......@@ -252,9 +252,9 @@ class MarianAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -117,7 +117,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding):
def __init__(self, num_embeddings: int, embedding_dim: int):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
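A hedged sketch (class name and forward signature hypothetical, not the actual MBart module) of the offset hack this comment describes: the embedding table is grown by two rows and every position id is shifted by the same offset, mirroring the fairseq convention of reserving the first ids for padding.

import torch
import torch.nn as nn

class LearnedPositionalEmbeddingSketch(nn.Embedding):
    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Reserve the first `offset` ids and enlarge the table accordingly.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, positions: torch.Tensor) -> torch.Tensor:
        # Shift every incoming position id by the same offset.
        return super().forward(positions + self.offset)

emb = LearnedPositionalEmbeddingSketch(num_embeddings=1024, embedding_dim=16)
print(emb(torch.arange(6)).shape)  # torch.Size([6, 16])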
......@@ -243,9 +243,9 @@ class MBartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -118,7 +118,7 @@ class TFMBartLearnedPositionalEmbedding(TFSharedEmbeddings):
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
......@@ -690,7 +690,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
......
......@@ -1487,7 +1487,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
@add_start_docstrings(
"""
MoibleBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
MOBILEBERT_START_DOCSTRING,
......
......@@ -674,7 +674,7 @@ MPNET_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare MPNet Model transformer outputing raw hidden-states without any specific head on top.",
"The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_START_DOCSTRING,
)
class TFMPNetModel(TFMPNetPreTrainedModel):
......
......@@ -154,7 +154,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
MPNet tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
......
......@@ -146,7 +146,7 @@ class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super().__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head
......@@ -178,7 +178,7 @@ class Attention(nn.Module):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implementation method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e4 * (1 - b)
......@@ -202,11 +202,11 @@ class Attention(nn.Module):
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
return x.view(*new_x_shape) # in Tensorflow implementation: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
x = x.view(*new_x_shape) # in Tensorflow implementation: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
......@@ -467,7 +467,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if position_ids is None:
# Code is different from when we had a single embedding matrice from position and token embeddings
# Code is different from when we had a single embedding matrix from position and token embeddings
position_ids = self.position_ids[None, : input_shape[-1]]
# Attention mask.
......@@ -814,7 +814,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]
......
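A hedged illustration (toy values) of what the corrected warning is about: given input_ids, the model can locate the last non-padding token of each row, but given only inputs_embeds it cannot see the pad token and falls back to the last position (sequence_lengths = -1).

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [3, 4, 0, 0, 0]])
# Index of the last non-padding token per row; this is what is lost when only
# inputs_embeds are provided.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)  # tensor([2, 1])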
......@@ -62,7 +62,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
......
......@@ -252,9 +252,9 @@ class PegasusAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -719,7 +719,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
......
......@@ -723,9 +723,9 @@ class ProphetNetAttention(nn.Module):
attn_weights = attn_weights + attention_mask
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attn_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attn_heads, tgt_len, src_len)
......@@ -1243,7 +1243,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
......@@ -1380,7 +1380,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
......
......@@ -285,7 +285,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
>>> # load retriever
>>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
>>> # load fine-tuned model with retriver
>>> # load fine-tuned model with retriever
>>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
"""
......
......@@ -234,7 +234,7 @@ class CanonicalHFIndex(HFIndexBase):
Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load.
......@@ -442,7 +442,7 @@ class RagRetriever:
def init_retrieval(self):
"""
Retriever initalization function. It loads the index into memory.
Retriever initialization function. It loads the index into memory.
"""
logger.info("initializing retrieval")
......
......@@ -612,7 +612,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
if isinstance(self.num_buckets, int):
assert (
self.num_buckets % 2 == 0
), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
rotation_size = self.num_buckets
num_buckets = self.num_buckets
else:
......
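A hedged sketch of why an even bucket count is asserted here, following the angular LSH scheme of the Reformer paper (shapes and names made up, not the library code): each random rotation column contributes a +/- pair of buckets, so the total bucket count is always twice the number of rotation columns.

import torch

num_buckets = 8
vectors = torch.randn(10, 16)                    # hypothetical query/key vectors
rotations = torch.randn(16, num_buckets // 2)    # random rotation matrix
rotated = vectors @ rotations                    # shape (10, num_buckets // 2)
# Concatenating rotated and -rotated doubles the bucket count, hence the
# requirement that num_buckets be even.
buckets = torch.argmax(torch.cat([rotated, -rotated], dim=-1), dim=-1)
print(buckets.shape, int(buckets.max()) < num_buckets)  # torch.Size([10]) True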
......@@ -179,7 +179,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
......
......@@ -319,9 +319,9 @@ class Speech2TextAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......@@ -559,7 +559,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
return input_lengths
def _get_subsampled_encoder_attn_mask(self, attention_mask):
# generate creates 3D attention mask, becuase of the shape of input_features
# generate creates 3D attention mask, because of the shape of input_features
# convert it to 2D if thats the case
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
......
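A hedged sketch with made-up shapes of the 3D-to-2D conversion this comment refers to: keeping the last slice along the trailing dimension turns a (batch, time, feature)-style mask into one value per timestep.

import torch

attention_mask = torch.ones(2, 100, 80, dtype=torch.long)  # hypothetical 3D mask
if len(attention_mask.shape) > 2:
    attention_mask = attention_mask[:, :, -1]
print(attention_mask.shape)  # torch.Size([2, 100])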
......@@ -1172,7 +1172,7 @@ T5_ENCODER_INPUTS_DOCSTRING = r"""
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
# Warning messafe for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
......
......@@ -637,7 +637,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
if inputs["inputs_embeds"] is None:
assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
batch_size, seq_length = input_shape
......