Unverified Commit b24ead87 authored by LSinev, committed by GitHub

fix some typos in docs, comments, logging/errors (#11432)

parent e3e70f95
......@@ -451,7 +451,7 @@ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domain
the class Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
is set to False, then the tokenizer will downcase everything except for emoticons.
is set to False, then the tokenizer will lowercase everything except for emoticons.
"""
......
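The `preserve_case` docstring fixed above appears to be adapted from NLTK's Twitter-aware `TweetTokenizer`. A minimal usage sketch of the documented behaviour, assuming `nltk` is installed (the example sentence and output are illustrative, not taken from this repository):

```python
from nltk.tokenize import TweetTokenizer

# preserve_case=False lowercases every token except emoticons
tok = TweetTokenizer(preserve_case=False)
print(tok.tokenize("GREAT movie :D"))  # ['great', 'movie', ':D'] -- the emoticon keeps its case
```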
......@@ -724,7 +724,7 @@ class BigBirdBlockSparseAttention(nn.Module):
band_product, dim=-1
) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
# contibution of sliding keys
# contribution of sliding keys
# [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
context_layer = self.torch_bmm_nd(
attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
......@@ -876,7 +876,7 @@ class BigBirdBlockSparseAttention(nn.Module):
attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view(
bsz, n_heads, from_block_size, 3, to_block_size
) # inner_band_product
# global keys (correspomding to 1st key block)
# global keys (corresponding to 1st key block)
attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
:, :, :, :, :to_block_size
].view(
......@@ -946,7 +946,7 @@ class BigBirdBlockSparseAttention(nn.Module):
@staticmethod
def torch_gather_b2(params, indices):
# this operation is equilvalent to tf.gather when batch_dims=2
# this operation is equivalent to tf.gather when batch_dims=2
if params.shape[:2] != indices.shape[:2]:
raise ValueError(
......@@ -1054,7 +1054,7 @@ class BigBirdBlockSparseAttention(nn.Module):
to_block_size: int. size of block in to sequence.
num_rand_blocks: int. Number of random chunks per row.
last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
if positive then num_rand_blocks blocks choosen only upto last_idx.
if positive then num_rand_blocks blocks chosen only up to last_idx.
Returns:
adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
......@@ -1149,7 +1149,7 @@ class BigBirdBlockSparseAttention(nn.Module):
plan_block_length = np.array(plan_from_length) // from_block_size
# till when to follow plan
max_plan_idx = plan_from_length.index(from_seq_length)
# Random Attention adjajency list
# Random Attention adjacency list
rand_attn = [
np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
for i in range(num_heads)
......@@ -1246,8 +1246,8 @@ class BigBirdBlockSparseAttention(nn.Module):
Args:
block_id: int. block id of row.
to_start_block_id: int. random attention coloum start id.
to_end_block_id: int. random attention coloum end id.
to_start_block_id: int. random attention column start id.
to_end_block_id: int. random attention column end id.
num_rand_blocks: int. number of random blocks to be selected.
window_block_left: int. number of blocks of window to left of a block.
window_block_right: int. number of blocks of window to right of a block.
......@@ -1825,7 +1825,7 @@ BIG_BIRD_INPUTS_DOCSTRING = r"""
@dataclass
class BigBirdForPreTrainingOutput(ModelOutput):
"""
Output type of :class:`~transformers.BigBirdtForPreTraining`.
Output type of :class:`~transformers.BigBirdForPreTraining`.
Args:
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
......@@ -2941,7 +2941,7 @@ class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
logits_mask = None
if question_lengths is not None:
# setting lengths logits to `-infi`
# setting lengths logits to `-inf`
logits_mask = self.prepare_question_mask(question_lengths, seqlen)
if token_type_ids is None:
token_type_ids = (~logits_mask).long()
......
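One of the BigBird hunks above fixes the comment on `torch_gather_b2` ("equivalent to tf.gather when batch_dims=2"). A self-contained sketch of what that means in PyTorch terms: gather rows independently for every (batch, head) pair. The shapes and the flatten-and-offset approach are illustrative, not a copy of the library function.

```python
import torch

bsz, n_heads, num_rows, dim = 2, 4, 6, 3
params = torch.randn(bsz, n_heads, num_rows, dim)
indices = torch.randint(0, num_rows, (bsz, n_heads, 5))

# Flatten the two batch dimensions, offset the indices per (batch, head) pair,
# and index once into the flattened parameter tensor.
flat_params = params.reshape(bsz * n_heads * num_rows, dim)
offsets = torch.arange(bsz * n_heads).unsqueeze(-1) * num_rows  # (bsz*n_heads, 1)
flat_indices = indices.reshape(bsz * n_heads, -1) + offsets
gathered = flat_params[flat_indices.reshape(-1)].reshape(bsz, n_heads, 5, dim)

# Reference: an explicit loop over the two batch dimensions gives the same result.
reference = torch.stack(
    [torch.stack([params[b, h][indices[b, h]] for h in range(n_heads)]) for b in range(bsz)]
)
assert torch.equal(gathered, reference)
```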
......@@ -237,9 +237,9 @@ class BlenderbotAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
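The "reshape twice" comment fixed in the Blenderbot hunk above (and in its BlenderbotSmall and LED twins later in this diff) is about keeping the returned attention weights connected to the autograd graph when `output_attentions` is set. A small hedged sketch of the pattern; the original `attn_weights` is renamed `scores` here so its gradient can be inspected, and the shapes are made up:

```python
import torch

bsz, num_heads, tgt_len, src_len = 2, 4, 5, 5
scores = torch.randn(bsz * num_heads, tgt_len, src_len, requires_grad=True)

# 4-D view handed back to the caller when output_attentions=True ...
attn_weights_reshaped = scores.view(bsz, num_heads, tgt_len, src_len)
# ... and the tensor that keeps flowing through the model is re-derived from that
# view, so both stay on the same autograd graph.
attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)

attn_weights.sum().backward()
print(scores.grad.shape)  # torch.Size([8, 5, 5]) -- the gradient reaches the original tensor
```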
......@@ -235,9 +235,9 @@ class BlenderbotSmallAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......
......@@ -695,7 +695,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **maked**.
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
......@@ -739,7 +739,7 @@ CONVBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
"The bare ConvBERT Model transformer outputing raw hidden-states without any specific head on top.",
"The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.",
CONVBERT_START_DOCSTRING,
)
class TFConvBertModel(TFConvBertPreTrainedModel):
......
......@@ -683,7 +683,7 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]
......
......@@ -511,7 +511,7 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-
query_size (int): the length of query
key_size (int): the length of key
bucket_size (int): the size of position bucket
max_position (int): the maxium allowed absolute positoin
max_position (int): the maximum allowed absolute position
Return:
:obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
......@@ -698,7 +698,7 @@ class DisentangledSelfAttention(torch.nn.Module):
relative_pos = relative_pos.unsqueeze(1)
# bsz x height x query x key
elif relative_pos.dim() != 4:
raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
att_span = self.pos_ebd_size
relative_pos = relative_pos.long().to(query_layer.device)
......
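For the `build_relative_position` docstring fixed above, a stripped-down sketch of the returned tensor, ignoring the `bucket_size` and `max_position` handling (an illustration of the simple case, not the library implementation):

```python
import torch

def simple_relative_position(query_size: int, key_size: int) -> torch.LongTensor:
    q_ids = torch.arange(query_size, dtype=torch.long)
    k_ids = torch.arange(key_size, dtype=torch.long)
    rel_pos_ids = q_ids[:, None] - k_ids[None, :]  # entry (q, k) is q - k, shape (query_size, key_size)
    return rel_pos_ids.unsqueeze(0)                # (1, query_size, key_size), as the docstring states

print(simple_relative_position(3, 4))
```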
......@@ -428,7 +428,7 @@ class SPMTokenizer:
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
......
......@@ -134,7 +134,7 @@ def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
# detect whether this is a do_lower_case situation, which can be derived by checking whether we
# have at least one upcase letter in the source vocab
# have at least one uppercase letter in the source vocab
do_lower_case = True
for k in src_vocab.keys():
if not k.islower():
......
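The FSMT conversion hunk above truncates the `do_lower_case` detection loop. A hedged guess at the complete idea, matching the fixed comment ("at least one uppercase letter in the source vocab"); the helper name and the early exit are illustrative, not the script's actual code:

```python
def infer_do_lower_case(src_vocab: dict) -> bool:
    # treat the vocab as cased as soon as any key fails str.islower()
    # (note: str.islower() is also False for tokens with no cased characters)
    for token in src_vocab:
        if not token.islower():
            return False
    return True

print(infer_do_lower_case({"hello": 0, "world": 1}))  # True
print(infer_do_lower_case({"Hello": 0, "world": 1}))  # False
```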
......@@ -252,7 +252,7 @@ FSMT_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
......@@ -486,7 +486,7 @@ class FSMTEncoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
Returns:
BaseModelOutput or Tuple comprised of:
......@@ -696,7 +696,7 @@ class FSMTDecoder(nn.Module):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
......
......@@ -184,7 +184,7 @@ class FunnelAttentionStructure(nn.Module):
self.sin_dropout = nn.Dropout(config.hidden_dropout)
self.cos_dropout = nn.Dropout(config.hidden_dropout)
# Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
# dividide.
# divided.
self.pooling_mult = None
def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None):
......@@ -218,7 +218,7 @@ class FunnelAttentionStructure(nn.Module):
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.
For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.
Paper link: https://arxiv.org/abs/2006.03236
......
......@@ -169,7 +169,7 @@ class TFFunnelAttentionStructure:
For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2,
final formula.
For the relative shif attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final
formula.
Paper link: https://arxiv.org/abs/2006.03236
......@@ -1009,7 +1009,7 @@ class TFFunnelForPreTrainingOutput(ModelOutput):
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.ensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
shape :obj:`(batch_size, sequence_length, hidden_size)`.
......
......@@ -70,7 +70,7 @@ class TFAttention(tf.keras.layers.Layer):
super().__init__(**kwargs)
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
# [switch nx => n_state from Block to Attention to keep identical to TF implementation]
assert n_state % config.n_head == 0
self.n_ctx = n_ctx
self.n_head = config.n_head
......
......@@ -70,7 +70,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a signficant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
"""
bs = (
......@@ -189,7 +189,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.cache = {}
self.add_prefix_space = add_prefix_space
# Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
@property
......@@ -247,7 +247,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
for token in re.findall(self.pat, text):
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
......
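The two GPT-2 tokenizer hunks above both revolve around the byte-to-unicode lookup table: `bytes_to_unicode` builds it, and `byte_encoder` applies it before BPE so bytes such as the space never reach the merge rules as raw control characters. A minimal re-sketch of the table-building idea (not the exact library function):

```python
def mini_bytes_to_unicode() -> dict:
    # printable latin-1 bytes map to themselves ...
    keep = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    table, n = {}, 0
    for b in range(256):
        if b in keep:
            table[b] = chr(b)
        else:
            # ... every other byte is shifted into an unused unicode range so it
            # stays a single, visible character instead of a raw control byte
            table[b] = chr(256 + n)
            n += 1
    return table

table = mini_bytes_to_unicode()
print(table[ord(" ")])  # 'Ġ' -- the space byte becomes a visible stand-in character
print(table[ord("a")])  # 'a' -- printable bytes are unchanged
```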
......@@ -221,7 +221,7 @@ class GPTNeoAttentionMixin:
if attention_mask is None:
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=device)
# A block can also be padded becuase of the _look_back operation
# A block can also be padded because of the _look_back operation
# look back into the attention_block such that it will also get padded the same way
# and have 0s in the padded position
attention_mask = GPTNeoAttentionMixin._look_back(attention_mask, block_length, window_size, is_key_value=False)
......@@ -804,8 +804,8 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x num_headss x N x N
# head_mask has shape n_layer x batch x num_headss x N x N
# attention_probs has shape bsz x num_heads x N x N
# head_mask has shape n_layer x batch x num_heads x N x N
head_mask = self.get_head_mask(head_mask, self.config.num_layers)
if inputs_embeds is None:
......
......@@ -35,11 +35,11 @@ class QuantEmbedding(nn.Module):
:obj:`torch.nn.Embedding`.
Args:
weight_bit (:obj:`int`, `optiona`l, defaults to :obj:`8`):
weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
Bitwidth for the quantized weight.
momentum (:obj:`float`, `optional, defaults to :obj:`0.95`):
momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
Momentum for updating the activation quantization range.
quant_mode (:obj:`bool`, `optional, defaults to :obj:`False`):
quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the layer is quantized.
"""
......@@ -619,7 +619,7 @@ def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_ma
`saturation_max`.
"""
# in this part, we do not need any gradient computation,
# in order to enfore this, we put torch.no_grad()
# in order to enforce this, we put torch.no_grad()
with torch.no_grad():
n = 2 ** (num_bits - 1) - 1
......
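Around the `torch.no_grad()` comment fixed above, the I-BERT hunk computes a symmetric quantization scale. A hedged, per-tensor sketch of that computation (the function name and the clamp floor are illustrative):

```python
import torch

def symmetric_scale(num_bits: int, saturation_min: torch.Tensor, saturation_max: torch.Tensor) -> torch.Tensor:
    # no gradients should flow through the range statistics, hence torch.no_grad()
    with torch.no_grad():
        n = 2 ** (num_bits - 1) - 1                                # signed levels on each side
        scale = torch.max(saturation_min.abs(), saturation_max.abs())
        return torch.clamp(scale, min=1e-8) / n

print(symmetric_scale(8, torch.tensor(-1.0), torch.tensor(2.0)))   # tensor(0.0157)
```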
......@@ -842,9 +842,9 @@ class LEDDecoderAttention(nn.Module):
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit akward, but it's required to
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to reshaped
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
......@@ -1499,7 +1499,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
......@@ -1689,7 +1689,7 @@ class LEDEncoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
......@@ -1920,7 +1920,7 @@ class LEDDecoder(LEDPreTrainedModel):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``:
......
......@@ -869,7 +869,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
# apply layer head maskin
# apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(
......@@ -1552,7 +1552,7 @@ LED_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
......@@ -1667,7 +1667,7 @@ class TFLEDEncoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
......@@ -1926,14 +1926,14 @@ class TFLEDDecoder(tf.keras.layers.Layer):
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
encoder_head_mask (:obj:`tf.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
on hidden heads. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
......
......@@ -393,7 +393,7 @@ class LongformerTokenClassifierOutput(ModelOutput):
def _get_question_end_index(input_ids, sep_token_id):
"""
Computes the index of the first occurance of `sep_token_id`.
Computes the index of the first occurrence of `sep_token_id`.
"""
sep_token_indices = (input_ids == sep_token_id).nonzero()
......@@ -1428,7 +1428,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``:
......
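For the `_get_question_end_index` docstring fixed above ("index of the first occurrence of `sep_token_id`"), a standalone sketch of the row-wise idea; the snippet above uses `nonzero()`, while this illustrative helper simply takes the minimum matching position per row:

```python
import torch

def first_sep_index(input_ids: torch.Tensor, sep_token_id: int) -> torch.Tensor:
    mask = input_ids == sep_token_id
    positions = torch.arange(input_ids.shape[1]).expand_as(input_ids)
    # non-matching positions get a sentinel equal to the sequence length,
    # so min() over each row returns the first separator position
    sentinel = torch.full_like(positions, input_ids.shape[1])
    return torch.where(mask, positions, sentinel).min(dim=1).values

input_ids = torch.tensor([[0, 5, 5, 2, 7, 2], [0, 9, 2, 2, 1, 1]])
print(first_sep_index(input_ids, sep_token_id=2))  # tensor([3, 2])
```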
......@@ -1388,7 +1388,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
# compute global attn probs
global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1)
# apply layer head maskin
# apply layer head masking
if layer_head_mask is not None:
if tf.executing_eagerly():
tf.debugging.assert_equal(
......@@ -1707,7 +1707,7 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
inputs["attention_mask"], (attention_mask_shape[0], attention_mask_shape[1], 1, 1)
)
# Since attention_mask is 1.0 for positions we want to locall attend locally and 0.0 for
# Since attention_mask is 1.0 for positions we want to attend locally and 0.0 for
# masked and global attn positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
......@@ -1920,7 +1920,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the heas is **masked**.
- 0 indicates the head is **masked**.
global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
......
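The comment fixed in the TFLongformerMainLayer hunk above describes the usual additive-mask trick: 1.0 becomes 0.0 (attend) and 0.0 becomes -10000.0 (ignore), so masked positions vanish after the softmax. A tiny framework-agnostic illustration, written with PyTorch for consistency with the other sketches in this page:

```python
import torch

attention_mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
additive_mask = (1.0 - attention_mask) * -10000.0   # 0.0 for kept positions, -10000.0 for masked ones
scores = torch.tensor([[2.0, 1.0, 3.0, 0.5]])

print(torch.softmax(scores + additive_mask, dim=-1))
# masked positions get ~0 probability: tensor([[0.7311, 0.2689, 0.0000, 0.0000]])
```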