Unverified Commit 969859d5 authored by Santiago Castro, committed by GitHub

Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
parent 4731a00c
@@ -25,14 +25,14 @@ logger = logging.get_logger(__name__)
class TFGenerationMixin:
"""
- A class contraining all of the functions supporting generation, to be used as a mixin in
- :class:`~transfomers.TFPreTrainedModel`.
+ A class containing all of the functions supporting generation, to be used as a mixin in
+ :class:`~transformers.TFPreTrainedModel`.
"""
def prepare_inputs_for_generation(self, inputs, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.TFPreTrainedModel` for custom behavior to prepare inputs in the
- generate method.
+ Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
+ the generate method.
"""
return {"inputs": inputs}
@@ -216,17 +216,17 @@ class TFGenerationMixin:
)
if input_ids is not None:
- batch_size = shape_list(input_ids)[0] # overriden by the input batch_size
+ batch_size = shape_list(input_ids)[0] # overridden by the input batch_size
else:
batch_size = 1
- assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+ assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
- assert temperature > 0, "`temperature` should be strictely positive."
+ assert temperature > 0, "`temperature` should be strictly positive."
assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
@@ -239,10 +239,10 @@ class TFGenerationMixin:
assert (eos_token_id is None) or (
isinstance(eos_token_id, int) and (eos_token_id >= 0)
), "`eos_token_id` should be a positive integer."
- assert length_penalty > 0, "`length_penalty` should be strictely positive."
+ assert length_penalty > 0, "`length_penalty` should be strictly positive."
assert (
isinstance(num_return_sequences, int) and num_return_sequences > 0
- ), "`num_return_sequences` should be a strictely positive integer."
+ ), "`num_return_sequences` should be a strictly positive integer."
assert (
bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
@@ -722,7 +722,7 @@ class TFGenerationMixin:
beam_scores[:, None], (batch_size * num_beams, vocab_size)
) # (batch_size * num_beams, vocab_size)
- # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+ # re-organize to group the beam together (we are keeping top hypothesis across beams)
next_scores = tf.reshape(
next_scores, (batch_size, num_beams * vocab_size)
) # (batch_size, num_beams * vocab_size)
@@ -897,7 +897,7 @@ class TFGenerationMixin:
def adjust_logits_during_generation(self, logits, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
the generate method.
"""
return logits
@@ -978,7 +978,7 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
"""
- Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
@@ -1047,7 +1047,7 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
def sample_without_replacement(logits, num_samples):
"""
- categorical sampling witouth replacement is currently not implemented the gumbel-max trick will do for now see
+ categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
...
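For context on the docstring fixed just above: the Gumbel-max (or Gumbel-top-k) trick it mentions draws several distinct categories by perturbing the logits with Gumbel noise and taking the top-k indices. A minimal, illustrative TensorFlow sketch of the standard form of the trick (not the library's exact helper)::

    import tensorflow as tf

    logits = tf.constant([[2.0, 1.0, 0.5, -1.0]])           # (batch, vocab_size)
    uniform = tf.random.uniform(tf.shape(logits), 0, 1)
    gumbel_noise = -tf.math.log(-tf.math.log(uniform))      # Gumbel(0, 1) samples
    # The top-k indices of the perturbed logits are k samples without replacement
    # from the categorical distribution defined by the logits.
    _, sampled_indices = tf.math.top_k(logits + gumbel_noise, k=2)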
@@ -29,20 +29,20 @@ logger = logging.get_logger(__name__)
class GenerationMixin:
"""
- A class contraining all of the functions supporting generation, to be used as a mixin in
- :class:`~transfomers.PreTrainedModel`.
+ A class containing all of the functions supporting generation, to be used as a mixin in
+ :class:`~transformers.PreTrainedModel`.
"""
def prepare_inputs_for_generation(self, input_ids, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to prepare inputs in the
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
generate method.
"""
return {"input_ids": input_ids}
def adjust_logits_during_generation(self, logits, **kwargs):
"""
- Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+ Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
the generate method.
"""
return logits
@@ -285,7 +285,7 @@ class GenerationMixin:
)
if input_ids is not None:
- batch_size = input_ids.shape[0] # overriden by the input batch_size
+ batch_size = input_ids.shape[0] # overridden by the input batch_size
else:
batch_size = 1
@@ -533,7 +533,7 @@ class GenerationMixin:
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
- independantly.
+ independently.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)
@@ -600,7 +600,7 @@ class GenerationMixin:
# unfinished_sents is set to zero if eos in sentence
unfinished_sents.mul_((~eos_in_sents).long())
- # stop when there is a </s> in each sentence, or if we exceed the maximul length
+ # stop when there is a </s> in each sentence, or if we exceed the maximum length
if unfinished_sents.max() == 0:
break
@@ -724,7 +724,7 @@ class GenerationMixin:
else:
next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
- # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+ # re-organize to group the beam together (we are keeping top hypothesis across beams)
next_scores = next_scores.view(
batch_size, num_beams * vocab_size
) # (batch_size, num_beams * vocab_size)
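A brief note on the reshape in this hunk: laying the scores out as (batch_size, num_beams * vocab_size) lets a single top-k call compare candidate continuations across all beams of one example; the originating beam and the token are then recovered by integer division and modulo. A toy sketch of that bookkeeping (illustrative values only, not the library's full beam-search loop)::

    import torch

    batch_size, num_beams, vocab_size = 2, 3, 5
    next_scores = torch.randn(batch_size * num_beams, vocab_size)
    flat_scores = next_scores.view(batch_size, num_beams * vocab_size)
    top_scores, top_ids = flat_scores.topk(2 * num_beams, dim=1)
    beam_ids = top_ids // vocab_size   # which beam each candidate continues
    token_ids = top_ids % vocab_size   # which token that candidate appends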
@@ -969,7 +969,7 @@ def top_k_top_p_filtering(
min_tokens_to_keep: int = 1,
) -> Tensor:
"""
- Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
...
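As a usage note for the function whose docstring is fixed above: the filtered logits are typically renormalized with a softmax and then sampled from, so that only the top-k / nucleus tokens can be drawn. A hedged sketch, assuming ``top_k_top_p_filtering`` is importable from the top-level package (otherwise import it from the module where it is defined)::

    import torch
    import torch.nn.functional as F
    from transformers import top_k_top_p_filtering

    next_token_logits = torch.randn(1, 50257)                        # (batch, vocab_size)
    filtered = top_k_top_p_filtering(next_token_logits.clone(), top_k=50, top_p=0.9)
    probs = F.softmax(filtered, dim=-1)                              # filtered-out tokens get probability 0
    next_token = torch.multinomial(probs, num_samples=1)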
@@ -49,7 +49,7 @@ class ModelCard:
"""
def __init__(self, **kwargs):
- # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+ # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
self.model_details = kwargs.pop("model_details", {})
self.intended_use = kwargs.pop("intended_use", {})
self.factors = kwargs.pop("factors", {})
...
@@ -488,7 +488,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
model_args (additional positional arguments, `optional`):
Will be passed along to the underlying model ``__init__()`` method.
config (:class:`~transformers.PretrainedConfig`, `optional`):
- Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- The model is a model provided by the library (loaded with the `shortcut name` string of a
@@ -522,7 +522,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to only look at local files (e.g., not try doanloading the model).
+ Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
@@ -1424,7 +1424,7 @@ class AutoModelForTokenClassification:
class AutoModelForMultipleChoice:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a
- multiple choice classifcation head---when created with the when created with the
+ multiple choice classification head---when created with the when created with the
:meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` class method or the
:meth:`~transformers.AutoModelForMultipleChoice.from_config` class method.
...
@@ -906,7 +906,7 @@ class BartModel(PretrainedBartModel):
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
- # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
...
@@ -69,8 +69,8 @@ class XSoftmax(torch.autograd.Function):
Args:
input (:obj:`torch.tensor`): The input tensor that will apply softmax.
- mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
- dim (int): The dimenssion that will apply softmax
+ mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+ dim (int): The dimension that will apply softmax
Example::
import torch
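For readers of the docstring fixed above: a masked softmax simply pushes the positions where the mask is 0 to minus infinity before normalizing, so they end up with zero probability. A minimal sketch of the idea in plain PyTorch (not the DeBERTa ``XSoftmax`` autograd function itself)::

    import torch

    scores = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
    mask = torch.tensor([[1, 1, 0, 1]])                      # 0 = ignore this element
    masked_scores = scores.masked_fill(mask == 0, float("-inf"))
    probs = torch.softmax(masked_scores, dim=-1)             # masked element gets probability 0.0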
@@ -540,16 +540,16 @@ class DisentangledSelfAttention(torch.nn.Module):
Args:
hidden_states (:obj:`torch.FloatTensor`):
- Input states to the module usally the output from previous layer, it will be the Q,K and V in
+ Input states to the module usually the output from previous layer, it will be the Q,K and V in
`Attention(Q,K,V)`
attention_mask (:obj:`torch.ByteTensor`):
- An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
+ An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
th token.
return_att (:obj:`bool`, optional):
- Whether return the attention maxitrix.
+ Whether return the attention matrix.
query_states (:obj:`torch.FloatTensor`, optional):
The `Q` state in `Attention(Q,K,V)`.
@@ -627,7 +627,7 @@ class DisentangledSelfAttention(torch.nn.Module):
relative_pos = relative_pos.unsqueeze(1)
# bxhxqxk
elif relative_pos.dim() != 4:
- raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+ raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
relative_pos = relative_pos.long().to(query_layer.device)
@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
<https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
- improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
+ improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
...
@@ -290,7 +290,7 @@ class Transformer(nn.Module):
attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
Returns:
- hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
+ hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
Tuple of length n_layers with the hidden states from each layer.
Optional: only if output_hidden_states=True
...
@@ -418,7 +418,7 @@ DPR_READER_INPUTS_DOCSTRING = r"""
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
- Whether or not to rturn the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
...
@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "EncoderDecoderConfig"
ENCODER_DECODER_START_DOCSTRING = r"""
- This class can be used to inialize a sequence-to-sequnece model with any pretrained autoencoding model as the
+ This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
:meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
...
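The docstring fixed above describes pairing any pretrained autoencoding encoder with any pretrained autoregressive decoder. A hedged usage sketch (the checkpoint names are only illustrative; the added cross-attention weights are newly initialized, so the resulting model still needs fine-tuning)::

    from transformers import EncoderDecoderModel

    # Build a seq2seq model from two pretrained BERT checkpoints; cross-attention
    # layers are added between encoder and decoder.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased"
    )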
@@ -99,7 +99,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
`What are position IDs? <../glossary.html#position-ids>`_
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
- also use :obj:`attention_mask` for the same result (see above), kept here for compatbility. Indices
+ also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
selected in ``[0, ..., input_ids.size(-1)]``:
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
...
@@ -124,18 +124,18 @@ class FlaxAutoModel(object):
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
- Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can
be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
pretrained model), or
- the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
- by suppling the save directory.
- - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+ by supplying the save directory.
+ - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
- an optional state dictionnary for the model to use instead of a state dictionary loaded from saved
+ an optional state dictionary for the model to use instead of a state dictionary loaded from saved
weights file. This option can be used if you want to create a model from a pretrained configuration but
load your own weights. In this case though, you should check if using
:func:`~transformers.FlaxPreTrainedModel.save_pretrained` and
@@ -150,14 +150,14 @@ class FlaxAutoModel(object):
they exists.
resume_download: (`optional`) boolean, default False:
- Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+ Do not delete incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
output_loading_info: (`optional`) boolean:
- Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error
+ Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
...
@@ -64,7 +64,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- - 0 for tokens that are **maked**.
+ - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
...
@@ -226,7 +226,7 @@ class FunnelAttentionStructure(nn.Module):
d_model = self.config.d_model
if self.config.attention_type == "factorized":
# Notations from the paper, appending A.2.2, final formula.
- # We need to create and return the matrics phi, psi, pi and omega.
+ # We need to create and return the matrices phi, psi, pi and omega.
pos_seq = torch.arange(0, seq_len, 1.0, dtype=dtype, device=device)
freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=dtype, device=device)
inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
@@ -1226,7 +1226,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
@add_start_docstrings(
"""
- Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
+ Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
first timestep of the last hidden state) e.g. for GLUE tasks.
""",
FUNNEL_START_DOCSTRING,
...
@@ -588,7 +588,7 @@ class GPT2Model(GPT2PreTrainedModel):
attention_mask = (1.0 - attention_mask) * -10000.0
# If a 2D ou 3D attention mask is provided for the cross-attention
- # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.add_cross_attention and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
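On the comment fixed above: making a 2D padding mask broadcastable means inserting singleton head and query dimensions so it lines up with attention scores of shape (batch_size, num_heads, seq_length, seq_length), and turning masked positions into a large negative bias. A small illustrative sketch (not the library's own helper)::

    import torch

    attention_mask = torch.tensor([[1, 1, 1, 0]])                  # (batch_size, seq_length)
    extended_mask = attention_mask[:, None, None, :].float()       # (batch_size, 1, 1, seq_length)
    extended_mask = (1.0 - extended_mask) * -10000.0               # 0 where attended, -10000 on padding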
@@ -708,7 +708,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
- # create postion_ids on the fly for batch generation
+ # create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past:
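A worked example of the two lines shown in this hunk, assuming a left-padded batch: the cumulative sum of the attention mask (minus one) produces increasing positions for the real tokens, and the padded positions are then filled with a dummy value::

    import torch

    attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
    position_ids = attention_mask.long().cumsum(-1) - 1    # tensor([[-1, -1, 0, 1, 2]])
    position_ids.masked_fill_(attention_mask == 0, 1)      # tensor([[ 1,  1, 0, 1, 2]])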
@@ -1050,7 +1050,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
sequence_lengths = -1
logger.warning(
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
- f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+ f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
)
pooled_logits = logits[range(batch_size), sequence_lengths]
...
@@ -382,7 +382,7 @@ class LongformerSelfAttention(nn.Module):
# batch_size x num_heads x max_num_global_attention_tokens x sequence_length
# which is the attention weights from tokens with global attention to all tokens
# It doesn't not return local attention
- # In case of variable number of global attantion in the rows of a batch,
+ # In case of variable number of global attention in the rows of a batch,
# attn_probs are padded with -10000.0 attention scores
attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
else:
@@ -416,7 +416,7 @@ class LongformerSelfAttention(nn.Module):
-0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4
- (pad & diagonilize) =>
+ (pad & diagonalize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
@@ -440,7 +440,7 @@ class LongformerSelfAttention(nn.Module):
@staticmethod
def _chunk(hidden_states, window_overlap):
- """convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+ """convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
# non-overlapping chunks of size = 2w
hidden_states = hidden_states.view(
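To illustrate the docstring fixed above: with window_overlap w, the sequence is cut into chunks of length 2w, and consecutive chunks share w positions. An illustrative sketch using ``Tensor.unfold`` (not the exact Longformer implementation)::

    import torch

    w = 2                                                    # window_overlap
    hidden_states = torch.arange(8).view(1, 8, 1)            # (batch, seq_len, hidden)
    chunks = hidden_states.unfold(1, 2 * w, w)               # chunks of size 2w, step w
    # chunks.squeeze() -> [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]]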
@@ -491,7 +491,7 @@ class LongformerSelfAttention(nn.Module):
chunked_query = self._chunk(query, window_overlap)
chunked_key = self._chunk(key, window_overlap)
- # matrix multipication
+ # matrix multiplication
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
# bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap
@@ -1030,7 +1030,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
- Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+ Mask to decide the attention given on each token, local attention or global attention. Tokens with global
attention attends to all other tokens, and all other tokens attend to them. This is important for
task-specific finetuning because it makes the model more flexible at representing the task. For example,
for classification, the <s> token should be given global attention. For QA, all question tokens should also
...
@@ -58,7 +58,7 @@ class GeLU(nn.Module):
@dataclass
class LxmertModelOutput(ModelOutput):
"""
- Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+ Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
encoder")
@@ -405,7 +405,7 @@ class LxmertSelfAttentionLayer(nn.Module):
self.output = LxmertAttentionOutput(config)
def forward(self, input_tensor, attention_mask, output_attentions=False):
- # Self attention attends to itself, thus keys and querys are the same (input_tensor).
+ # Self attention attends to itself, thus keys and queries are the same (input_tensor).
output = self.self(
input_tensor,
input_tensor,
@@ -799,7 +799,7 @@ LXMERT_START_DOCSTRING = r"""
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
- question answering attribute prediction, and object tag predicition.
+ question answering attribute prediction, and object tag prediction.
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
@@ -1076,12 +1076,10 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- cur_qa_logit_layer (:obj:`torch.nn.Linear`):
- Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+ just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
anything.
Return:
@@ -1298,12 +1296,10 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
will add newly initialized weights. Reducing the size will remove weights from the end
Args:
- cur_qa_logit_layer (:obj:`torch.nn.Linear`):
- Old linear layer to be resized.
num_labels (:obj:`int`, `optional`):
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
- just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+ just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
anything.
Return:
...
@@ -887,7 +887,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
)
# If a 2D ou 3D attention mask is provided for the cross-attention
- # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
...
@@ -40,7 +40,7 @@ class RetrievAugLMMarginOutput(ModelOutput):
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
- Languaged modeling loss.
+ Language modeling loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
each vocabulary token.
@@ -413,7 +413,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
Used by the (:class:`~transformers.RagModel`) model during decoding.
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
- Provide for generation tasks. `None` by default, constuct as per instructions for the generator model
+ Provide for generation tasks. `None` by default, construct as per instructions for the generator model
you're using with your RAG instance.
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
@@ -424,7 +424,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
:obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
decoding.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+ Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
:obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
:obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
:obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
@@ -660,7 +660,7 @@ class RagModel(RagPreTrainedModel):
@add_start_docstrings_to_model_forward(
"""
- A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
+ A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
@@ -736,7 +736,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
- >>> # or use retriever seperately
+ >>> # or use retriever separately
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -940,13 +940,13 @@ class RagSequenceForGeneration(RagPreTrainedModel):
) # batch_size x n_docs x tgt_len x dim
doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
- # RAG-sequence marginaliation
+ # RAG-sequence marginalization
first_token_scores = seq_logprobs[:, :, :1, :]
second_token_scores = seq_logprobs[:, :, 1:2, :]
remainder = seq_logprobs[:, :, 2:, :]
rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
- # calcualate loss
+ # calculate loss
target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
assert target.dim() == rag_logprobs.dim()
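The comments fixed in this hunk refer to RAG-sequence marginalization: the log-likelihood of a generated sequence combines, in log space, the retrieval score of each document with the sequence score computed conditioned on that document, then takes a log-sum-exp over documents. A minimal sketch of that idea (shapes simplified, not the library's exact loss)::

    import torch

    batch_size, n_docs = 2, 5
    doc_logprobs = torch.log_softmax(torch.randn(batch_size, n_docs), dim=1)   # log p(doc | question)
    seq_logprobs = torch.randn(batch_size, n_docs)                             # sum of token log-probs per doc
    marginalized = torch.logsumexp(doc_logprobs + seq_logprobs, dim=1)         # log p(answer | question)
    loss = -marginalized.mean()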
@@ -986,7 +986,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
@add_start_docstrings_to_model_forward(
"""
- A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
+ A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
""",
RAG_START_DOCSTRING,
)
@@ -1129,7 +1129,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
>>> input_ids = input_dict["input_ids"]
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
- >>> # or use retriever seperately
+ >>> # or use retriever separately
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
>>> # 1. Encode
>>> question_hidden_states = model.question_encoder(input_ids)[0]
@@ -1257,7 +1257,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
to the forward pass. :obj:`context_input_ids` are returned by
:meth:`~transformers.RagRetriever.__call__`.
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
- Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+ Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
:obj:`question_encoder_last_hidden_state`.
If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
...
@@ -986,7 +986,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
class ReverseSort(Function):
"""
After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized
- backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here.
+ backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here.
"""
@staticmethod
@@ -2075,7 +2075,7 @@ class ReformerModel(ReformerPreTrainedModel):
device=device,
)
- # start index for postion encoding depends on incremental decoding
+ # start index for position encoding depends on incremental decoding
if past_buckets_states is not None:
start_idx_pos_encodings = past_buckets_states[0][1].shape[1]
else:
...
@@ -79,7 +79,7 @@ RETRIBERT_START_DOCSTRING = r"""
@add_start_docstrings(
- """Bert Based model to embed queries or document for document retreival. """,
+ """Bert Based model to embed queries or document for document retrieval. """,
RETRIBERT_START_DOCSTRING,
)
class RetriBertModel(RetriBertPreTrainedModel):
@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
attention_mask, input_shape, device
)
- # define function for cehckpointing
+ # define function for checkpointing
def partial_encode(*inputs):
encoder_outputs = sent_encoder.encoder(
inputs[0],
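The comment fixed just above concerns gradient checkpointing: part of the forward pass is wrapped in a helper such as ``partial_encode`` so it can be run under ``torch.utils.checkpoint.checkpoint``, which saves memory by recomputing intermediate activations during the backward pass. A generic sketch of the pattern (the ``partial_forward`` helper here is hypothetical, not RetriBERT's own code)::

    import torch
    from torch.utils import checkpoint

    layer = torch.nn.Linear(16, 16)
    inputs = torch.randn(4, 16, requires_grad=True)

    def partial_forward(x):
        return layer(x)

    out = checkpoint.checkpoint(partial_forward, inputs)    # activations recomputed on backward
    out.sum().backward()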
@@ -200,7 +200,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
Return:
:obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
- its corresponding document and each cocument to its corresponding query in the batch
+ its corresponding document and each document to its corresponding query in the batch
"""
device = input_ids_query.device
q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
...