Unverified Commit 969859d5 authored by Santiago Castro, committed by GitHub

Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
parent 4731a00c
@@ -44,7 +44,7 @@ _CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
-# This dict contrains shortcut names and associated url
+# This dict contains shortcut names and associated url
# for the pretrained weights provided with the models
####################################################
T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -156,7 +156,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
class T5LayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
-Construct a layernorm module in the T5 style No bias and no substraction of mean.
+Construct a layernorm module in the T5 style No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
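The docstring fixed in the hunk above describes T5's layer norm: it rescales by the root mean square of the activations, with no learned bias and no subtraction of the mean. A minimal sketch of that idea (illustration only; `SimpleT5LayerNorm` is a hypothetical name, not the class in this file):

```python
import torch
from torch import nn

class SimpleT5LayerNorm(nn.Module):
    """RMS-style layer norm: no bias and no subtraction of the mean."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Scale by the root mean square over the last dimension only.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)
```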
@@ -256,7 +256,7 @@ class T5Attention(nn.Module):
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
-max_distance: an intege
+max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
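For readers unfamiliar with this helper, the docstring above describes T5's relative-position bucketing: small offsets each get their own bucket, larger offsets share logarithmically sized buckets up to `max_distance`. A scalar sketch of the scheme (simplified from the tensor version in the file; assumes the standard T5 defaults):

```python
import math

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    # Scalar sketch; the real function operates on int32 tensors.
    bucket = 0
    n = -relative_position
    if bidirectional:
        # Half the buckets are reserved for offsets on the other side.
        num_buckets //= 2
        if n < 0:
            bucket += num_buckets
            n = -n
    else:
        n = max(n, 0)
    max_exact = num_buckets // 2
    if n < max_exact:
        return bucket + n  # nearby offsets each get their own bucket
    # Distant offsets share logarithmically sized buckets, capped at num_buckets - 1.
    val = max_exact + int(
        math.log(n / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    )
    return bucket + min(val, num_buckets - 1)
```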
@@ -705,7 +705,7 @@ class T5Stack(T5PreTrainedModel):
raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")
if inputs_embeds is None:
-assert self.embed_tokens is not None, "You have to intialize the model with valid token embeddings"
+assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs_embeds = self.embed_tokens(input_ids)
batch_size, seq_length = input_shape
...
@@ -739,7 +739,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Albert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Albert Model transformer outputting raw hidden-states without any specific head on top.",
ALBERT_START_DOCSTRING,
)
class TFAlbertModel(TFAlbertPreTrainedModel):
...
@@ -364,14 +364,14 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
model_args (additional positional arguments, `optional`):
Will be passed along to the underlying model ``__init__()`` method.
config (:class:`~transformers.PretrainedConfig`, `optional`):
-Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the `shortcut name` string of a
pretrained model).
- The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
-by suppling the save directory.
+by supplying the save directory.
-- The model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+- The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict (`Dict[str, torch.Tensor]`, `optional`):
A state dictionary to use instead of a state dictionary loaded from saved weights file.
@@ -398,7 +398,7 @@ TF_AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether or not to only look at local files (e.g., not try doanloading the model).
+Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
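The two hunks above touch the usual `from_pretrained` options for the TF auto classes. A hedged usage sketch of the documented parameters (the checkpoint name is an example, not a recommendation, and the keys of `loading_info` are assumed from the docstring):

```python
from transformers import AutoConfig, TFAutoModel

config = AutoConfig.from_pretrained("bert-base-cased")  # example checkpoint
model, loading_info = TFAutoModel.from_pretrained(
    "bert-base-cased",
    config=config,              # use this config instead of the auto-loaded one
    output_loading_info=True,   # also return missing/unexpected keys and errors
    local_files_only=False,     # set True to skip downloading
)
print(loading_info)             # e.g., missing and unexpected keys
```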
@@ -815,7 +815,7 @@ class TFAutoModelForMaskedLM:
This is a generic model class that will be instantiated as one of the model classes of the library---with a masked
language modeling head---when created with the when created with the
:meth:`~transformers.TFAutoModelForMaskedLM.from_pretrained` class method or the
-:meth:`~transformers.TFAutoModelForMasedLM.from_config` class method.
+:meth:`~transformers.TFAutoModelForMaskedLM.from_config` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
"""
@@ -1297,7 +1297,7 @@ class TFAutoModelForTokenClassification:
class TFAutoModelForMultipleChoice:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
-multiple choice classifcation head---when created with the when created with the
+multiple choice classification head---when created with the when created with the
:meth:`~transformers.TFAutoModelForMultipleChoice.from_pretrained` class method or the
:meth:`~transformers.TFAutoModelForMultipleChoice.from_config` class method.
...
@@ -332,7 +332,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
- **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch,
-embed_dim)`. Only populated if *return_all_hiddens* is True.
+embed_dim)`. Only populated if *output_hidden_states* is True.
- **all_attentions** (List[Tensor]): Attention weights for each layer.
During training might not be of length n_layers because of layer dropout.
"""
...
@@ -784,7 +784,7 @@ BERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING,
)
class TFBertModel(TFBertPreTrainedModel):
...
@@ -346,7 +346,7 @@ class TFTransformer(tf.keras.layers.Layer):
Returns:
hidden_state: tf.Tensor(bs, seq_length, dim)
-Sequence of hiddens states in the last (top) layer
+Sequence of hidden states in the last (top) layer
all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
Tuple of length n_layers with the hidden states from each layer.
Optional: only if output_hidden_states=True
@@ -552,7 +552,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
-iinputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+inputs_embeds (:obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
@@ -571,7 +571,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
+"The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertModel(TFDistilBertPreTrainedModel):
...
@@ -109,7 +109,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
languages ids which can be obtained from the language names by using two conversion mappings provided in
the configuration of the model (only provided for multilingual models). More precisely, the `language name
-to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary strring to int) and the
+to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
`language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).
See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
@@ -128,7 +128,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``:
cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
Dictionary string to ``tf.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -214,7 +214,7 @@ class TFFlaubertPreTrainedModel(TFPreTrainedModel):
@add_start_docstrings(
-"The bare Flaubert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertModel(TFFlaubertPreTrainedModel):
...
@@ -178,7 +178,7 @@ class TFFunnelAttentionStructure:
self.sin_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
self.cos_dropout = tf.keras.layers.Dropout(config.hidden_dropout)
# Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was
-# dividide.
+# divided.
self.pooling_mult = None
def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None, training=False):
@@ -219,7 +219,7 @@ class TFFunnelAttentionStructure:
"""
if self.attention_type == "factorized":
# Notations from the paper, appending A.2.2, final formula.
-# We need to create and return the matrics phi, psi, pi and omega.
+# We need to create and return the matrices phi, psi, pi and omega.
pos_seq = tf.range(0, seq_len, 1.0, dtype=dtype)
freq_seq = tf.range(0, self.d_model // 2, 1.0, dtype=dtype)
inv_freq = 1 / (10000 ** (freq_seq / (self.d_model // 2)))
...
@@ -549,7 +549,7 @@ GPT2_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING,
)
class TFGPT2Model(TFGPT2PreTrainedModel):
...
@@ -172,9 +172,9 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
-return incremental_indicies + self.padding_idx
+incremental_indices = tf.math.cumsum(mask, axis=1) * mask
+return incremental_indices + self.padding_idx
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
@@ -560,7 +560,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
# batch_size x num_heads x max_num_global_attention_tokens x sequence_length
# which is the attention weights from tokens with global attention to all tokens
# It doesn't not return local attention
-# In case of variable number of global attantion in the rows of a batch,
+# In case of variable number of global attention in the rows of a batch,
# attn_probs are padded with -10000.0 attention scores
# LOCAL ATTN:
# without global attention, return local attention probabilities
@@ -618,7 +618,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
chunked_query = self._chunk(query, window_overlap)
chunked_key = self._chunk(key, window_overlap)
-# matrix multipication
+# matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcxy: batch_size * num_heads x chunks x 2window_overlap x 2window_overlap
@@ -826,7 +826,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
-(pad & diagonilize) =>
+(pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
@@ -853,7 +853,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
@staticmethod
def _chunk(hidden_states, window_overlap):
-"""convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
batch_size, seq_length, hidden_dim = shape_list(hidden_states)
num_output_chunks = 2 * (seq_length // (2 * window_overlap)) - 1
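The `_chunk` docstring fixed above describes splitting a sequence into overlapping windows of size `2w` with stride `w`. A NumPy illustration of the resulting chunk count (toy sizes, not the TF implementation):

```python
import numpy as np

window_overlap = 2                      # w
seq = np.arange(8)                      # toy sequence, seq_length = 8
chunks = [seq[i : i + 2 * window_overlap] for i in range(0, len(seq) - window_overlap, window_overlap)]
# chunks -> [0 1 2 3], [2 3 4 5], [4 5 6 7]
num_output_chunks = 2 * (len(seq) // (2 * window_overlap)) - 1
assert len(chunks) == num_output_chunks == 3
```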
@@ -1557,7 +1557,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
global_attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
-Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
...
@@ -50,7 +50,7 @@ TF_LXMERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@dataclass
class TFLxmertModelOutput(ModelOutput):
"""
-Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder")
@@ -423,7 +423,7 @@ class TFLxmertSelfAttentionLayer(tf.keras.layers.Layer):
self.attention_output = TFLxmertAttentionOutput(config, name="output")
def call(self, input_tensor, attention_mask, output_attentions, training=False):
-# Self attention attends to itself, thus keys and querys are the same (input_tensor).
+# Self attention attends to itself, thus keys and queries are the same (input_tensor).
self_output = self.self(input_tensor, input_tensor, attention_mask, output_attentions)
if output_attentions:
attention_probs = self_output[1]
@@ -868,7 +868,7 @@ LXMERT_START_DOCSTRING = r"""
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
-question answering attribute prediction, and object tag predicition.
+question answering attribute prediction, and object tag prediction.
This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
@@ -962,7 +962,7 @@ LXMERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Lxmert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.",
LXMERT_START_DOCSTRING,
)
class TFLxmertModel(TFLxmertPreTrainedModel):
...
@@ -952,7 +952,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare MobileBert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare MobileBert Model transformer outputting raw hidden-states without any specific head on top.",
MOBILEBERT_START_DOCSTRING,
)
class TFMobileBertModel(TFMobileBertPreTrainedModel):
...
@@ -487,7 +487,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+"The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
...
@@ -39,7 +39,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
return tuple with:
- pytorch model weight name
-- transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each
+- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regards to each
other
"""
tf_name = tf_name.replace(":0", "") # device ids
@@ -270,7 +270,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
# Instantiate and load the associated TF 2.0 model
-tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining
+tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beginning
tf_model_class = getattr(transformers, tf_model_class_name)
tf_model = tf_model_class(pt_model.config)
...
@@ -118,9 +118,9 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
Returns: tf.Tensor
"""
mask = tf.cast(tf.math.not_equal(x, self.padding_idx), dtype=tf.int32)
-incremental_indicies = tf.math.cumsum(mask, axis=1) * mask
-return incremental_indicies + self.padding_idx
+incremental_indices = tf.math.cumsum(mask, axis=1) * mask
+return incremental_indices + self.padding_idx
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
@@ -709,7 +709,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING,
)
class TFRobertaModel(TFRobertaPreTrainedModel):
...
@@ -71,7 +71,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs):
"""
-Construct a layernorm module in the T5 style No bias and no substraction of mean.
+Construct a layernorm module in the T5 style No bias and no subtraction of mean.
"""
super().__init__(**kwargs)
self.variance_epsilon = epsilon
@@ -170,7 +170,7 @@ class TFT5Attention(tf.keras.layers.Layer):
relative_position: an int32 Tensor
bidirectional: a boolean - whether the attention is bidirectional
num_buckets: an integer
-max_distance: an intege
+max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
@@ -682,8 +682,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if self.is_decoder and encoder_attention_mask is not None:
# If a 2D ou 3D attention mask is provided for the cross-attention
-# we need to make broadcastabe to [batch_size, num_heads, mask_seq_length, mask_seq_length]
-# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+# we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
if num_dims_encoder_attention_mask == 3:
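The comments fixed in the hunk above refer to expanding a 2D or 3D cross-attention mask so it broadcasts against `[batch_size, num_heads, seq_length, seq_length]` attention scores. A hedged sketch of that expansion (shapes and the -1e9 masking constant are assumptions for illustration, not the exact code in this file):

```python
import tensorflow as tf

encoder_attention_mask = tf.ones((2, 7))        # [batch_size, mask_seq_length]
if len(encoder_attention_mask.shape) == 3:      # [batch, tgt_len, src_len]
    extended_mask = encoder_attention_mask[:, None, :, :]
else:                                           # [batch, src_len]
    extended_mask = encoder_attention_mask[:, None, None, :]
# Masked positions become large negative numbers before the softmax.
extended_mask = (1.0 - tf.cast(extended_mask, tf.float32)) * -1e9
print(extended_mask.shape)                      # (2, 1, 1, 7), broadcastable over heads and query length
```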
@@ -894,7 +894,7 @@ T5_INPUTS_DOCSTRING = r"""
sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of
the decoder.
past_key_values (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-ontains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
...
@@ -800,7 +800,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING,
)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
...
@@ -145,7 +145,7 @@ class TFCausalLanguageModelingLoss:
class TFQuestionAnsweringLoss:
"""
-Loss function suitable for quetion answering.
+Loss function suitable for question answering.
"""
def compute_loss(self, labels, logits):
@@ -807,7 +807,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
Args:
vocab_size (:obj:`int`):
-The size of the vocabular, e.g., the number of unique tokens.
+The size of the vocabulary, e.g., the number of unique tokens.
hidden_size (:obj:`int`):
The size of the embedding vectors.
initializer_range (:obj:`float`, `optional`):
@@ -860,7 +860,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
:obj:`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
:obj:`[batch_size, length, embedding_size]`.
-In linear mode, the ouput is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
+In linear mode, the output is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
Raises:
ValueError: if :obj:`mode` is not valid.
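The docstring fixed above distinguishes the shared-embedding layer's "embedding" mode from its "linear" mode: the same weight matrix is used as a lookup table and as an output projection. A toy sketch of the two modes (sizes are arbitrary and this is not the class itself):

```python
import tensorflow as tf

vocab_size, hidden_size = 10, 4
shared_weights = tf.random.normal((vocab_size, hidden_size))      # one shared matrix

input_ids = tf.constant([[1, 2, 3]])                              # [batch_size, length]
embedded = tf.gather(shared_weights, input_ids)                   # "embedding" mode: [batch, length, hidden_size]
logits = tf.matmul(embedded, shared_weights, transpose_b=True)    # "linear" mode: [batch, length, vocab_size]
print(embedded.shape, logits.shape)                               # (1, 3, 4) (1, 3, 10)
```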
@@ -1043,7 +1043,7 @@ def get_initializer(initializer_range: float = 0.02) -> tf.initializers.Truncate
def cast_bool_to_primitive(bool_variable: Union[tf.Tensor, bool], default_tensor_to_true=False) -> bool:
"""
Function arguments can be inserted as boolean tensor and bool variables to cope with Keras serialization we need to
-cast the bool argumnets (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
+cast the bool arguments (like :obj:`output_attentions` for instance) to correct boolean if it is a tensor.
Args:
bool_variable (:obj:`Union[tf.Tensor, bool]`):
...
@@ -654,7 +654,7 @@ XLM_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`__
lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in
+also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
``[0, ..., input_ids.size(-1)]``.
cache (:obj:`Dict[str, tf.Tensor]`, `optional`):
Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
@@ -688,7 +688,7 @@ XLM_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING,
)
class TFXLMModel(TFXLMPreTrainedModel):
...
@@ -652,7 +652,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
# data mask: input mask & perm mask
assert input_mask is None or attention_mask is None, (
"You can only use one of input_mask (uses 1 for padding) "
-"or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
+"or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
)
if input_mask is None and attention_mask is not None:
input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
@@ -1122,7 +1122,7 @@ XLNET_INPUTS_DOCSTRING = r"""
@add_start_docstrings(
-"The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
+"The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING,
)
class TFXLNetModel(TFXLNetPreTrainedModel):
...