Unverified Commit b24ead87 authored by LSinev, committed by GitHub

fix some typos in docs, comments, logging/errors (#11432)

parent e3e70f95
@@ -95,9 +95,9 @@ class LxmertConfig(PretrainedConfig):
             Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
             objective.
         task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add object prediction, attribute ppredictionand feature regression to the loss objective.
+            Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
         task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add the question-asansweringoss to the objective
+            Whether or not to add the question-answering loss to the objective
         visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to calculate the object-prediction loss objective
         visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
...
@@ -306,9 +306,9 @@ class M2M100Attention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
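Note: the comment fixed above describes a genuinely non-obvious trick, so here is a minimal, self-contained sketch (not the library code itself) of why the double `view` is needed: the tensor returned for `output_attentions` has to stay on the autograd path of the rest of the forward pass, otherwise it would never receive a gradient.

```python
import torch

bsz, num_heads, tgt_len, src_len = 2, 4, 5, 5
attn_weights = torch.rand(bsz * num_heads, tgt_len, src_len, requires_grad=True)

# First view: the (bsz, num_heads, tgt_len, src_len) tensor handed back to the
# caller when output_attentions=True.
attn_weights_reshaped = attn_weights.view(bsz, num_heads, tgt_len, src_len)
# Second view: derive the tensor used by the rest of the forward pass *from*
# the returned one, so the returned tensor stays inside the autograd graph.
attn_weights = attn_weights_reshaped.view(bsz * num_heads, tgt_len, src_len)

attn_weights_reshaped.retain_grad()
attn_weights.sum().backward()
print(attn_weights_reshaped.grad.shape)  # torch.Size([2, 4, 5, 5])
```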
...
@@ -252,9 +252,9 @@ class MarianAttention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
...
@@ -117,7 +117,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding):
     def __init__(self, num_embeddings: int, embedding_dim: int):
         # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models dont have this hack
+        # and adjust num_embeddings appropriately. Other models don't have this hack
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim)
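Note: for context on the comment fixed here, this is a rough sketch (an assumption for illustration, not the library code) of what the offset-by-2 hack amounts to: the embedding table gets two extra rows and every position id is shifted by `self.offset` before the lookup.

```python
import torch
import torch.nn as nn

class LearnedPositionalEmbeddingSketch(nn.Embedding):
    """Illustrative only: allocate two extra rows and shift position ids by the offset."""

    def __init__(self, num_embeddings: int, embedding_dim: int):
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, seq_len: int, past_key_values_length: int = 0):
        positions = torch.arange(past_key_values_length, past_key_values_length + seq_len)
        return super().forward(positions + self.offset)

emb = LearnedPositionalEmbeddingSketch(num_embeddings=1024, embedding_dim=16)
print(emb(seq_len=5).shape)  # torch.Size([5, 16])
```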
@@ -243,9 +243,9 @@ class MBartAttention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
...
@@ -118,7 +118,7 @@ class TFMBartLearnedPositionalEmbedding(TFSharedEmbeddings):
     def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs):
         # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models dont have this hack
+        # and adjust num_embeddings appropriately. Other models don't have this hack
         self.offset = 2
         super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
@@ -690,7 +690,7 @@ class TFMBartEncoder(tf.keras.layers.Layer):
             Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
             - 1 indicates the head is **not masked**,
-            - 0 indicates the heas is **masked**.
+            - 0 indicates the head is **masked**.
         inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
...
@@ -1487,7 +1487,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
 @add_start_docstrings(
     """
-    MoibleBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+    MobileBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
     for Named-Entity-Recognition (NER) tasks.
     """,
     MOBILEBERT_START_DOCSTRING,
...
@@ -674,7 +674,7 @@ MPNET_INPUTS_DOCSTRING = r"""
 @add_start_docstrings(
-    "The bare MPNet Model transformer outputing raw hidden-states without any specific head on top.",
+    "The bare MPNet Model transformer outputting raw hidden-states without any specific head on top.",
     MPNET_START_DOCSTRING,
 )
 class TFMPNetModel(TFMPNetPreTrainedModel):
...
@@ -154,7 +154,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
         :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
 
-        MPNet tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
+        MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
         comprise the space before the `<mask>`.
         """
         if self._mask_token is None and self.verbose:
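Note: the "greedily comprise the space" behaviour mentioned in this docstring comes from how the mask token is registered; a hedged sketch of the idea (the exact setter lives in the tokenizer class, the checkpoint name and `lstrip` detail are assumptions for illustration):

```python
from transformers import AddedToken, MPNetTokenizerFast

# Registering the mask token with lstrip=True is what lets "<mask>" absorb the
# space in front of it, so the fill-mask pipeline sees a single token.
tokenizer = MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")
tokenizer.mask_token = AddedToken("<mask>", lstrip=True, rstrip=False)
print(tokenizer.tokenize("Paris is the capital of <mask>."))
```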
...
@@ -146,7 +146,7 @@ class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
         super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
         assert n_state % config.n_head == 0
         self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
         self.n_head = config.n_head
@@ -178,7 +178,7 @@ class Attention(nn.Module):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))
-        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
+        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implementation method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
         b = self.bias[:, :, : w.size(-2), : w.size(-1)]
         w = w * b + -1e4 * (1 - b)
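Note: since the `mask_attn_weights` comment above is terse, here is a small standalone sketch (assumed shapes, not the actual module) of what the cropped `bias` buffer does: future positions get a large negative score, so softmax drives their weight towards zero.

```python
import torch

n_ctx, seq_len = 512, 4
bias = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)  # causal buffer

w = torch.rand(1, 1, seq_len, seq_len)          # raw attention scores (q @ k)
b = bias[:, :, : w.size(-2), : w.size(-1)]      # crop: the buffer may be larger than w
w = w * b + -1e4 * (1 - b)                      # masked (future) positions -> -1e4
print(torch.softmax(w, dim=-1)[0, 0])           # each row attends only to itself and the past
```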
@@ -202,11 +202,11 @@ class Attention(nn.Module):
     def merge_heads(self, x):
         x = x.permute(0, 2, 1, 3).contiguous()
         new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+        return x.view(*new_x_shape)  # in Tensorflow implementation: fct merge_states
 
     def split_heads(self, x, k=False):
         new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        x = x.view(*new_x_shape)  # in Tensorflow implementation: fct split_states
         if k:
             return x.permute(0, 2, 3, 1)
         else:
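Note: a quick illustrative round trip (assumed shapes) showing that `split_heads` and `merge_heads` are inverses; for keys, `split_heads(x, k=True)` instead permutes to `(batch, head, head_dim, seq)` so that `q @ k` needs no extra transpose.

```python
import torch

bsz, seq_len, n_head, head_dim = 2, 5, 4, 16
x = torch.rand(bsz, seq_len, n_head * head_dim)

# split_heads: (bsz, seq, n_embd) -> (bsz, n_head, seq, head_dim)
x_split = x.view(bsz, seq_len, n_head, head_dim).permute(0, 2, 1, 3)
# merge_heads: the inverse, back to (bsz, seq, n_embd)
x_merged = x_split.permute(0, 2, 1, 3).contiguous().view(bsz, seq_len, n_head * head_dim)

assert torch.equal(x, x_merged)
```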
@@ -467,7 +467,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         if position_ids is None:
-            # Code is different from when we had a single embedding matrice from position and token embeddings
+            # Code is different from when we had a single embedding matrix from position and token embeddings
             position_ids = self.position_ids[None, : input_shape[-1]]
 
         # Attention mask.
@@ -814,7 +814,7 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel):
             sequence_lengths = -1
             logger.warning(
                 f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
-                f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+                f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
             )
 
         pooled_logits = logits[range(batch_size), sequence_lengths]
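Note: for context on the warning whose typo is fixed here: with `input_ids` the index of the last non-padding token can be recovered from `pad_token_id`, but with only `inputs_embeds` there is nothing to compare against, hence the fallback to `-1` (the last position). A sketch of the usual computation, with an assumed pad id:

```python
import torch

pad_token_id = 0  # assumed for the example
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [8, 9, 0, 0, 0]])

# Index of the last non-padding token in each row.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths)  # tensor([2, 1])
```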
...
@@ -62,7 +62,7 @@ class TFAttention(tf.keras.layers.Layer):
         super().__init__(**kwargs)
 
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
         assert (
             n_state % config.n_head == 0
         ), f"Hidden dimension {n_state} not dividable by number of heads {config.n_head}"
...
@@ -252,9 +252,9 @@ class PegasusAttention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
...
@@ -719,7 +719,7 @@ class TFPegasusEncoder(tf.keras.layers.Layer):
             Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``:
             - 1 indicates the head is **not masked**,
-            - 0 indicates the heas is **masked**.
+            - 0 indicates the head is **masked**.
         inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
             Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
...
@@ -723,9 +723,9 @@ class ProphetNetAttention(nn.Module):
             attn_weights = attn_weights + attention_mask
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(batch_size, self.num_attn_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(batch_size * self.num_attn_heads, tgt_len, src_len)
@@ -1243,7 +1243,7 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
     r"""
     word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
         The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
-        pre-defined word embeddings instead of randomely initialized word embeddings.
+        pre-defined word embeddings instead of randomly initialized word embeddings.
     """
 
     def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
@@ -1380,7 +1380,7 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
     r"""
     word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
         The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
-        pre-defined word embeddings instead of randomely initialized word embeddings.
+        pre-defined word embeddings instead of randomly initialized word embeddings.
     """
 
     def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
...
@@ -285,7 +285,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
         >>> # load retriever
         >>> retriever = RagRetriever.from_pretrained(PATH, index_name="exact", use_dummy_dataset=True)
-        >>> # load fine-tuned model with retriver
+        >>> # load fine-tuned model with retriever
         >>> model = TFRagModel.from_pretrained("./rag", retriever=retriever)
         """
...
@@ -234,7 +234,7 @@ class CanonicalHFIndex(HFIndexBase):
     Args:
         vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
         dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
-            A datatset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
+            A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
             with ``datasets.list_datasets()``).
         dataset_split (:obj:`str`, optional, defaults to ``train``)
             Which split of the ``dataset`` to load.
@@ -442,7 +442,7 @@ class RagRetriever:
     def init_retrieval(self):
         """
-        Retriever initalization function. It loads the index into memory.
+        Retriever initialization function. It loads the index into memory.
         """
         logger.info("initializing retrieval")
...
@@ -612,7 +612,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
         if isinstance(self.num_buckets, int):
             assert (
                 self.num_buckets % 2 == 0
-            ), f"There should be an even number of bucktes, but `self.num_bucktes`: {self.num_buckets}"
+            ), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}"
             rotation_size = self.num_buckets
             num_buckets = self.num_buckets
         else:
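Note: why must `num_buckets` be even? In random-rotation LSH (a simplified single-head sketch of the scheme, not the Reformer code itself), each vector is projected onto `num_buckets // 2` random directions and the projections are concatenated with their negations, so the argmax picks one of `num_buckets` half-spaces:

```python
import torch

head_dim, num_buckets, seq_len = 64, 8, 10
assert num_buckets % 2 == 0  # the constraint checked above

vectors = torch.randn(seq_len, head_dim)
rotations = torch.randn(head_dim, num_buckets // 2)   # random projection directions
rotated = vectors @ rotations                         # (seq_len, num_buckets // 2)
scores = torch.cat([rotated, -rotated], dim=-1)       # (seq_len, num_buckets)
buckets = torch.argmax(scores, dim=-1)                # one bucket id per position
print(buckets)
```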
...
@@ -179,7 +179,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
         :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
 
-        Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
+        Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
         comprise the space before the `<mask>`.
         """
         if self._mask_token is None and self.verbose:
...
@@ -319,9 +319,9 @@ class Speech2TextAttention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if output_attentions:
-            # this operation is a bit akward, but it's required to
+            # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to reshaped
+            # In order to do so, attn_weights have to be reshaped
             # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
@@ -559,7 +559,7 @@ class Speech2TextPreTrainedModel(PreTrainedModel):
         return input_lengths
 
     def _get_subsampled_encoder_attn_mask(self, attention_mask):
-        # generate creates 3D attention mask, becuase of the shape of input_features
+        # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
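Note: a rough sketch of what happens around this conversion (the two stride-2 convolutions are an assumption for the example): the 3D mask coming from `generate` is reduced to 2D, per-example frame counts are taken, and those lengths are shrunk to the convolutional feature extractor's output resolution.

```python
import torch

num_conv_layers = 2  # assumption for the sketch

def subsampled_lengths(lengths: torch.Tensor) -> torch.Tensor:
    # Each stride-2 conv roughly halves the number of frames.
    for _ in range(num_conv_layers):
        lengths = (lengths - 1) // 2 + 1
    return lengths

attention_mask = torch.ones(2, 100, 80)        # (batch, frames, feature_bins) from generate
if len(attention_mask.shape) > 2:
    attention_mask = attention_mask[:, :, -1]  # keep one value per frame -> (batch, frames)

print(subsampled_lengths(attention_mask.sum(-1).long()))  # tensor([25, 25])
```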
...
@@ -1172,7 +1172,7 @@ T5_ENCODER_INPUTS_DOCSTRING = r"""
             Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
 """
 
-# Warning messafe for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
 __HEAD_MASK_WARNING_MSG = """
 The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
 `decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
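Note: a standalone sketch of the deprecation path this message describes (an assumption about the control flow, not a copy of the T5 code): when only `head_mask` is passed, it is copied into `decoder_head_mask` and the `FutureWarning` is raised.

```python
import warnings

HEAD_MASK_WARNING_MSG = "head_mask was split into `head_mask` and `decoder_head_mask` ..."  # abbreviated

def resolve_head_masks(head_mask, decoder_head_mask):
    if head_mask is not None and decoder_head_mask is None:
        warnings.warn(HEAD_MASK_WARNING_MSG, FutureWarning)
        decoder_head_mask = head_mask  # legacy behaviour: copy encoder mask to the decoder
    return head_mask, decoder_head_mask
```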
...
@@ -637,7 +637,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
             raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
 
         if inputs["inputs_embeds"] is None:
-            assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
             inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
 
         batch_size, seq_length = input_shape
...