Unverified Commit b24ead87 authored by LSinev, committed by GitHub

fix some typos in docs, comments, logging/errors (#11432)

parent e3e70f95
......@@ -95,9 +95,9 @@ class LxmertConfig(PretrainedConfig):
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
objective.
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add object prediction, attribute ppredictionand feature regression to the loss objective.
Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add the question-asansweringoss to the objective
Whether or not to add the question-answering loss to the objective
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to calculate the object-prediction loss objective
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
......
......@@ -306,9 +306,9 @@ class M2M100Attention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
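The reshape comment corrected in this hunk recurs verbatim in the Marian, MBart, Pegasus, ProphetNet and Speech2Text attention modules further down. A minimal sketch (illustrative shapes, not the library code) of the pattern it describes: both .view() calls return views of the same storage, so the attention maps returned when output_attentions=True stay in the autograd graph and keep their gradient instead of being detached.

import torch

# Double-view pattern: views share storage with the original weights, so
# gradients still flow through the returned attention maps.
bsz, num_heads, tgt_len, src_len = 2, 4, 5, 5
attn_weights = torch.rand(bsz * num_heads, tgt_len, src_len, requires_grad=True)

attn_weights_reshaped = attn_weights.view(bsz, num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)

attn_weights.sum().backward()
print(attn_weights_reshaped.grad_fn is not None)  # True: still part of the graph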
......@@ -252,9 +252,9 @@ class MarianAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -117,7 +117,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding):
def __init__(self, num_embeddings: int, embedding_dim: int):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
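A hedged sketch (class name and forward signature hypothetical, not the actual MBart module) of the offset hack this comment describes: the embedding table is grown by two rows and every position id is shifted by the same offset, mirroring the fairseq convention of reserving the first ids for padding.

import torch
import torch.nn as nn

class LearnedPositionalEmbeddingSketch(nn.Embedding):
    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Reserve the first `offset` ids and enlarge the table accordingly.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, positions: torch.Tensor) -> torch.Tensor:
        # Shift every incoming position id by the same offset.
        return super().forward(positions + self.offset)

emb = LearnedPositionalEmbeddingSketch(num_embeddings=1024, embedding_dim=16)
print(emb(torch.arange(6)).shape)  # torch.Size([6, 16])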
......@@ -243,9 +243,9 @@ class MBartAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -118,7 +118,7 @@ class TFMBartLearnedPositionalEmbedding(TFSharedEmbeddings):
def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
......@@ -690,7 +690,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
......
......@@ -1487,7 +1487,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
@add_start_docstrings(
"""
MoibleBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
MOBILEBERT_START_DOCSTRING,
......
......@@ -674,7 +674,7 @@ MPNET_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare MPNet Model transformer outputing raw hidden-states without any specific head on top.",
"The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
MPNET_START_DOCSTRING,
)
class TFMPNetModel(TFMPNetPreTrainedModel):
......
......@@ -154,7 +154,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
MPNet tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
......
......@@ -146,7 +146,7 @@ class Attention(nn.Module):
def __init__(self, nx, n_ctx, config, scale=False):
super().__init__()
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
self.n_head = config.n_head
......@@ -178,7 +178,7 @@ class Attention(nn.Module):
w = torch.matmul(q, k)
if self.scale:
w = w / math.sqrt(v.size(-1))
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights
# w = w * self.bias + -1e9 * (1 - self.bias) # TF implementation method: mask_attn_weights
# XD: self.b may be larger than w, so we need to crop it
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
w = w * b + -1e4 * (1 - b)
......@@ -202,11 +202,11 @@ class Attention(nn.Module):
def merge_heads(self, x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
return x.view(*new_x_shape) # in Tensorflow implem: fct merge_states
return x.view(*new_x_shape) # in Tensorflow implementation: fct merge_states
def split_heads(self, x, k=False):
new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
x = x.view(*new_x_shape) # in Tensorflow implem: fct split_states
x = x.view(*new_x_shape) # in Tensorflow implementation: fct split_states
if k:
return x.permute(0, 2, 3, 1)
else:
......@@ -467,7 +467,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if position_ids is None:
# Code is different from when we had a single embedding matrice from position and token embeddings
# Code is different from when we had a single embedding matrix from position and token embeddings
position_ids = self.position_ids[None, : input_shape[-1]]
# Attention mask.
......@@ -814,7 +814,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]
......
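A hedged illustration (toy values) of what the corrected warning is about: given input_ids, the model can locate the last non-padding token of each row, but given only inputs_embeds it cannot see the pad token and falls back to the last position (sequence_lengths = -1).

import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [3, 4, 0, 0, 0]])
# Index of the last non-padding token per row; this is what is lost when only
# inputs_embeds are provided.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)  # tensor([2, 1])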
......@@ -62,7 +62,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert (
n_state % config.n_head == 0
), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
......
......@@ -252,9 +252,9 @@ class PegasusAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -719,7 +719,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
......
......@@ -723,9 +723,9 @@ class ProphetNetAttention(nn.Module):
attn_weights = attn_weights + attention_mask
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(batch_size, self.num_attn_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(batch_size * self.num_attn_heads, tgt_len, src_len)
......@@ -1243,7 +1243,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
......@@ -1380,7 +1380,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
r"""
word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
pre-defined word embeddings instead of randomely initialized word embeddings.
pre-defined word embeddings instead of randomly initialized word embeddings.
"""
def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
......
......@@ -285,7 +285,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
>>> # load retriever
>>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
>>> # load fine-tuned model with retriver
>>> # load fine-tuned model with retriever
>>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
"""
......
......@@ -234,7 +234,7 @@ class CanonicalHFIndex(HFIndexBase):
Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load.
......@@ -442,7 +442,7 @@ class RagRetriever:
def init_retrieval(self):
"""
Retriever initalization function. It loads the index into memory.
Retriever initialization function. It loads the index into memory.
"""
logger.info("initializing retrieval")
......
......@@ -612,7 +612,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
if isinstance(self.num_buckets, int):
assert (
self.num_buckets % 2 == 0
), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
rotation_size = self.num_buckets
num_buckets = self.num_buckets
else:
......
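A hedged sketch of why an even bucket count is asserted here, following the angular LSH scheme of the Reformer paper (shapes and names made up, not the library code): each random rotation column contributes a +/- pair of buckets, so the total bucket count is always twice the number of rotation columns.

import torch

num_buckets = 8
vectors = torch.randn(10, 16)                    # hypothetical query/key vectors
rotations = torch.randn(16, num_buckets // 2)    # random rotation matrix
rotated = vectors @ rotations                    # shape (10, num_buckets // 2)
# Concatenating rotated and -rotated doubles the bucket count, hence the
# requirement that num_buckets be even.
buckets = torch.argmax(torch.cat([rotated, -rotated], dim=-1), dim=-1)
print(buckets.shape, int(buckets.max()) < num_buckets)  # torch.Size([10]) True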
......@@ -179,7 +179,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
not having been set.
Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the `<mask>`.
"""
if self._mask_token is None and self.verbose:
......
......@@ -319,9 +319,9 @@ class Speech2TextAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......@@ -559,7 +559,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
return input_lengths
def _get_subsampled_encoder_attn_mask(self, attention_mask):
# generate creates 3D attention mask, becuase of the shape of input_features
# generate creates 3D attention mask, because of the shape of input_features
# convert it to 2D if thats the case
if len(attention_mask.shape) > 2:
attention_mask = attention_mask[:, :, -1]
......
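A hedged sketch with made-up shapes of the 3D-to-2D conversion this comment refers to: keeping the last slice along the trailing dimension turns a (batch, time, feature)-style mask into one value per timestep.

import torch

attention_mask = torch.ones(2, 100, 80, dtype=torch.long)  # hypothetical 3D mask
if len(attention_mask.shape) > 2:
    attention_mask = attention_mask[:, :, -1]
print(attention_mask.shape)  # torch.Size([2, 100])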
......@@ -1172,7 +1172,7 @@ T5_ENCODER_INPUTS_DOCSTRING = r"""
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""
# Warning messafe for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
......
......@@ -637,7 +637,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
if inputs["inputs_embeds"] is None:
assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
batch_size, seq_length = input_shape
......