Commit edfd82f5 authored by Sylvain Gugger, committed by GitHub

Change model outputs types to self-document outputs (#5438)

* [WIP] Proposal for model outputs

* All Bert models

* Make CI green maybe?

* Fix ONNX test

* Isolate ModelOutput from pt and tf

* Formatting

* Add Electra models

* Auto-generate docstrings from outputs

* Add TF outputs

* Add some BERT models

* Revert TF side

* Remove last traces of TF changes

* Fail with a clear error message

* Add Albert and work through Bart

* Add CTRL and DistilBert

* Formatting

* Progress on Bart

* Renames and finish Bart

* Formatting

* Fix last test

* Add DPR

* Finish Electra and add FlauBERT

* Add GPT2

* Add Longformer

* Add MMBT

* Add MobileBert

* Add GPT

* Formatting

* Add Reformer

* Add Roberta

* Add T5

* Add Transformer XL

* Fix test

* Add XLM + fix XLMForTokenClassification

* Style + XLMRoberta

* Add XLNet

* Formatting

* Add doc of return_tuple arg
parent fa265230
@@ -36,11 +36,13 @@ from .file_utils import (
     add_start_docstrings,
     add_start_docstrings_to_callable,
 )
+from .modeling_outputs import BaseModelOutput, CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput
 from .modeling_utils import PreTrainedModel, apply_chunking_to_forward

 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "ReformerConfig"
 _TOKENIZER_FOR_DOC = "ReformerTokenizer"

 REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -1493,6 +1495,10 @@ REFORMER_INPUTS_DOCSTRING = r"""
             For more information, see `num_hashes` in :class:`transformers.ReformerConfig`.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
 """
@@ -1528,7 +1534,12 @@ class ReformerModel(ReformerPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/reformer-crime-and-punishment",
+        output_type=BaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -1539,29 +1550,13 @@ class ReformerModel(ReformerPreTrainedModel):
         num_hashes=None,
         output_hidden_states=None,
         output_attentions=None,
+        return_tuple=None,
     ):
-        r"""
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -1628,13 +1623,12 @@ class ReformerModel(ReformerPreTrainedModel):
         if must_pad_to_match_chunk_length:
             sequence_output = sequence_output[:, :orig_sequence_length]

-        outputs = (sequence_output,)
-        # TODO(PVP): Replace by named tuple after namedtuples are introduced in the library.
-        if output_hidden_states is True:
-            outputs = outputs + (encoder_outputs.all_hidden_states,)
-        if output_attentions is True:
-            outputs = outputs + (encoder_outputs.all_attentions,)
-        return outputs
+        hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None
+        attentions = encoder_outputs.all_attentions if output_attentions else None
+
+        if return_tuple:
+            return tuple(v for v in [sequence_output, hidden_states, attentions] if v is not None)
+        return BaseModelOutput(last_hidden_state=sequence_output, hidden_states=hidden_states, attentions=attentions)

     def _pad_to_mult_of_chunk_length(
         self,
@@ -1712,7 +1706,12 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
         pass

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/reformer-crime-and-punishment",
+        output_type=CausalLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -1724,6 +1723,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
         labels=None,
         output_hidden_states=None,
         output_attentions=None,
+        return_tuple=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1731,25 +1731,8 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss (cross entropy).
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         reformer_outputs = self.reformer(
             input_ids,
@@ -1760,12 +1743,13 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             num_hashes=num_hashes,
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
+            return_tuple=return_tuple,
         )

         sequence_output = reformer_outputs[0]
         logits = self.lm_head(sequence_output)
-        outputs = (logits,) + reformer_outputs[1:]
+        loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = logits[..., :-1, :].contiguous()
@@ -1773,8 +1757,17 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
-            outputs = (loss,) + outputs
-        return outputs  # (lm_loss), lm_logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (logits,) + reformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=reformer_outputs.hidden_states,
+            attentions=reformer_outputs.attentions,
+        )

     def prepare_inputs_for_generation(self, input_ids, past, **kwargs):
         # TODO(PVP): Add smart caching
@@ -1806,7 +1799,12 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
         pass

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/reformer-crime-and-punishment",
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -1818,31 +1816,15 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
         labels=None,
         output_hidden_states=None,
         output_attentions=None,
+        return_tuple=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss (cross entropy).
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         reformer_outputs = self.reformer(
             input_ids,
@@ -1853,18 +1835,27 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
             num_hashes=num_hashes,
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
+            return_tuple=return_tuple,
         )

         sequence_output = reformer_outputs[0]
         logits = self.lm_head(sequence_output)
-        outputs = (logits,) + reformer_outputs[1:]
+        masked_lm_loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()  # -100 index = padding token
             masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-        return outputs  # (mlm_loss), lm_logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (logits,) + reformer_outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=logits,
+            hidden_states=reformer_outputs.hidden_states,
+            attentions=reformer_outputs.attentions,
+        )


 @add_start_docstrings(
@@ -1889,7 +1880,12 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
         pass

     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/reformer-crime-and-punishment",
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -1902,6 +1898,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
         end_positions=None,
         output_hidden_states=None,
         output_attentions=None,
+        return_tuple=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1912,26 +1909,8 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
-    Return:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ReformerConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         reformer_outputs = self.reformer(
             input_ids,
@@ -1942,6 +1921,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             num_hashes=num_hashes,
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
+            return_tuple=return_tuple,
         )

         sequence_output = reformer_outputs[0]
@@ -1951,8 +1931,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)

-        outputs = (start_logits, end_logits,) + reformer_outputs[1:]
+        total_loss = None
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -1968,6 +1947,15 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (start_logits, end_logits) + reformer_outputs[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=reformer_outputs.hidden_states,
+            attentions=reformer_outputs.attentions,
+        )
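The output classes constructed above (BaseModelOutput, CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput) come from modeling_outputs.py, which is not part of this diff excerpt; they derive from the shared ModelOutput base introduced by this PR. Conceptually each one is a small dataclass of optional tensors. A simplified, hypothetical stand-in for one of them, just to show the shape of the idea (field names mirror the constructor calls above):

from dataclasses import dataclass
from typing import Optional, Tuple

import torch


@dataclass
class QuestionAnsweringModelOutputSketch:
    # Simplified stand-in for transformers.modeling_outputs.QuestionAnsweringModelOutput.
    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None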
@@ -26,10 +26,18 @@ from torch.nn import CrossEntropyLoss, MSELoss

 from .configuration_roberta import RobertaConfig
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
+from .modeling_outputs import (
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)

 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "RobertaConfig"
 _TOKENIZER_FOR_DOC = "RobertaTokenizer"

 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -133,6 +141,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
 """
@@ -179,7 +191,12 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         return self.lm_head.decoder

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="roberta-base",
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -191,6 +208,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
         **kwargs
     ):
         r"""
@@ -201,24 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             in ``[0, ..., config.vocab_size]``
         kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Masked language modeling loss.
-        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -227,6 +227,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             )
             labels = kwargs.pop("masked_lm_labels")
         assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         outputs = self.roberta(
             input_ids,
@@ -237,18 +238,26 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
         )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)

-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+        masked_lm_loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
-            outputs = (masked_lm_loss,) + outputs
-        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )


 class RobertaLMHead(nn.Module):
@@ -295,7 +304,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="roberta-base",
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -307,6 +321,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -314,25 +329,9 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification (or regression if config.num_labels==1) loss.
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+
         outputs = self.roberta(
             input_ids,
             attention_mask=attention_mask,
@@ -342,11 +341,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
         )
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)

-        outputs = (logits,) + outputs[2:]
+        loss = None
         if labels is not None:
             if self.num_labels == 1:
                 #  We are doing regression
@@ -355,9 +355,14 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             else:
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )


 @add_start_docstrings(
@@ -379,7 +384,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="roberta-base",
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -391,33 +401,15 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
             Labels for computing the multiple choice classification loss.
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
             of the input tensors. (see `input_ids` above)
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Classification loss.
-        classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
-            `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -439,6 +431,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             inputs_embeds=flat_inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
         )
         pooled_output = outputs[1]

@@ -446,14 +439,18 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         logits = self.classifier(pooled_output)
         reshaped_logits = logits.view(-1, num_choices)

-        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
-            outputs = (loss,) + outputs
-        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )


 @add_start_docstrings(
@@ -476,7 +473,12 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="roberta-base",
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -488,30 +490,14 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
-            Classification loss.
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
-            Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         outputs = self.roberta(
             input_ids,
@@ -522,6 +508,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
         )

         sequence_output = outputs[0]
@@ -529,8 +516,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)

-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
             # Only keep active parts of the loss
@@ -543,9 +529,14 @@ class RobertaForTokenClassification(BertPreTrainedModel):
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-        return outputs  # (loss), scores, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )


 class RobertaClassificationHead(nn.Module):
@@ -586,7 +577,12 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()

     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="roberta-base",
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids=None,
@@ -599,6 +595,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         end_positions=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -609,27 +606,8 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
-    Returns:
-        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-        start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-start scores (before SoftMax).
-        end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
-            Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         outputs = self.roberta(
             input_ids,
@@ -640,6 +618,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_tuple=return_tuple,
         )

         sequence_output = outputs[0]
@@ -649,7 +628,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)

-        outputs = (start_logits, end_logits,) + outputs[2:]
+        total_loss = None
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1:
@@ -665,9 +644,18 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             start_loss = loss_fct(start_logits, start_positions)
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+
+        if return_tuple:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )


 def create_position_ids_from_input_ids(input_ids, padding_idx):
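As with the Reformer heads above, the RoBERTa heads now expose named fields (start_logits, end_logits, loss, ...) while `return_tuple=True` keeps positional unpacking working for existing code. A small, hypothetical sketch of the difference for the question-answering head (roberta-base has no trained QA head, so the scores are meaningless; only the access pattern matters here):

import torch
from transformers import RobertaForQuestionAnswering, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

question = "Who wrote Crime and Punishment?"
context = "Crime and Punishment was written by Dostoevsky."
inputs = tokenizer(question, context, return_tensors="pt")

# Named fields instead of positional tuple entries.
outputs = model(**inputs)
start = torch.argmax(outputs.start_logits, dim=-1)
end = torch.argmax(outputs.end_logits, dim=-1)

# Old behaviour on request: a plain (start_logits, end_logits) tuple.
start_logits, end_logits = model(**inputs, return_tuple=True)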
@@ -27,12 +27,20 @@ from torch import nn
 from torch.nn import CrossEntropyLoss

 from .configuration_t5 import T5Config
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+    replace_return_docstrings,
+)
+from .modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, Seq2SeqLMOutput, Seq2SeqModelOutput
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer

 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "T5Config"
 _TOKENIZER_FOR_DOC = "T5Tokenizer"

 ####################################################
@@ -667,6 +675,7 @@ class T5Stack(T5PreTrainedModel):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_tuple=None,
     ):

         use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -674,6 +683,7 @@ class T5Stack(T5PreTrainedModel):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -704,6 +714,9 @@ class T5Stack(T5PreTrainedModel):
         else:
             mask_seq_length = seq_length

+        if use_cache is True:
+            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
+
         if attention_mask is None:
             attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device)
         if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None:
@@ -726,9 +739,9 @@ class T5Stack(T5PreTrainedModel):
         # Prepare head mask if needed
         head_mask = self.get_head_mask(head_mask, self.config.num_layers)

-        present_key_value_states = ()
-        all_hidden_states = ()
-        all_attentions = ()
+        present_key_value_states = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
         position_bias = None
         encoder_decoder_position_bias = None
@@ -761,7 +774,8 @@ class T5Stack(T5PreTrainedModel):
             if self.is_decoder and encoder_hidden_states is not None:
                 encoder_decoder_position_bias = layer_outputs[5 if output_attentions else 3]
             # append next layer key value states
-            present_key_value_states = present_key_value_states + (present_key_value_state,)
+            if use_cache:
+                present_key_value_states = present_key_value_states + (present_key_value_state,)

             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[2],)  # We keep only self-attention weights for now
@@ -773,15 +787,18 @@ class T5Stack(T5PreTrainedModel):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states,)
-        if use_cache is True:
-            assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
-            outputs = outputs + (present_key_value_states,)
-        if output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (presents,) (all hidden states), (all attentions)
+        if return_tuple:
+            return tuple(
+                v
+                for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )


 T5_START_DOCSTRING = r"""
@@ -849,6 +866,10 @@ T5_INPUTS_DOCSTRING = r"""
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
 """
...@@ -894,6 +915,7 @@ class T5Model(T5PreTrainedModel): ...@@ -894,6 +915,7 @@ class T5Model(T5PreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -908,42 +930,25 @@ class T5Model(T5PreTrainedModel): ...@@ -908,42 +930,25 @@ class T5Model(T5PreTrainedModel):
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example:: Example::
>>> from transformers import T5Tokenizer, T5Model >>> from transformers import T5Tokenizer, T5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5Model.from_pretrained('t5-small') >>> model = T5Model.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
...@@ -954,6 +959,13 @@ class T5Model(T5PreTrainedModel): ...@@ -954,6 +959,13 @@ class T5Model(T5PreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
)
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
) )
hidden_states = encoder_outputs[0] hidden_states = encoder_outputs[0]
...@@ -984,13 +996,24 @@ class T5Model(T5PreTrainedModel): ...@@ -984,13 +996,24 @@ class T5Model(T5PreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
if use_cache is True: past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
past = ((encoder_outputs, decoder_outputs[1]),) if return_tuple:
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
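A hedged usage sketch of the new return type; the checkpoint and sentence follow the docstring example above, and ``decoder_input_ids`` is passed explicitly since ``T5Model`` does not build decoder inputs on its own:

>>> from transformers import T5Tokenizer, T5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5Model.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state       # decoder output, previously tuple element 0
>>> encoder_final = outputs.encoder_last_hidden_state    # encoder tensors now have stable names as well
>>> legacy = model(input_ids=input_ids, decoder_input_ids=input_ids, return_tuple=True)
>>> last_hidden_states = legacy[0]                       # positional access, as before this change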
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
...@@ -1031,6 +1054,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1031,6 +1054,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1046,6 +1070,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1046,6 +1070,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -1058,27 +1083,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1058,27 +1083,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss (cross entropy).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
If `past_key_value_states` is used only the last prediction_scores of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_value_states (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
...@@ -1105,6 +1109,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1105,6 +1109,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
...@@ -1116,6 +1121,13 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1116,6 +1121,13 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
)
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
) )
hidden_states = encoder_outputs[0] hidden_states = encoder_outputs[0]
...@@ -1145,28 +1157,38 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1145,28 +1157,38 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
# insert decoder past at right place
# to speed up decoding
if use_cache is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
sequence_output = decoder_outputs[0] sequence_output = decoder_outputs[0]
# Rescale output before projecting on vocab # Rescale output before projecting on vocab
# See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
sequence_output = sequence_output * (self.model_dim ** -0.5) sequence_output = sequence_output * (self.model_dim ** -0.5)
lm_logits = self.lm_head(sequence_output) lm_logits = self.lm_head(sequence_output)
decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here loss = None
if labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-100) loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
decoder_outputs = (loss,) + decoder_outputs
return decoder_outputs + encoder_outputs past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
if return_tuple:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
return ((loss,) + output) if loss is not None else output
return Seq2SeqLMOutput(
loss=loss,
logits=lm_logits,
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
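A similar hedged sketch for the LM head, assuming the ``labels`` keyword used in the loss computation above; with ``return_tuple=True`` the loss is prepended to the tuple only when labels are given:

>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids=input_ids, labels=input_ids)
>>> loss, logits = outputs.loss, outputs.logits              # named access on Seq2SeqLMOutput
>>> legacy = model(input_ids=input_ids, labels=input_ids, return_tuple=True)
>>> loss, logits = legacy[0], legacy[1]                      # loss only sits at index 0 because labels were passed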
def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs): def prepare_inputs_for_generation(self, input_ids, past, attention_mask, use_cache, **kwargs):
assert past is not None, "past has to be defined for encoder_outputs" assert past is not None, "past has to be defined for encoder_outputs"
......
...@@ -20,20 +20,22 @@ ...@@ -20,20 +20,22 @@
import logging import logging
from typing import Optional from dataclasses import dataclass
from typing import List, Optional, Tuple
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from .configuration_transfo_xl import TransfoXLConfig from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "TransfoXLConfig"
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer" _TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -590,6 +592,73 @@ class TransfoXLPreTrainedModel(PreTrainedModel): ...@@ -590,6 +592,73 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
return embeddings.cutoffs return embeddings.cutoffs
@dataclass
class TransfoXLModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: List[torch.FloatTensor]
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class TransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
losses (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided):
Language modeling losses (not reduced).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
losses: Optional[torch.FloatTensor]
prediction_scores: torch.FloatTensor
mems: List[torch.FloatTensor]
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
TRANSFO_XL_START_DOCSTRING = r""" TRANSFO_XL_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
...@@ -626,6 +695,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" ...@@ -626,6 +695,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
""" """
...@@ -751,7 +824,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -751,7 +824,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems return new_mems
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -760,32 +838,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -760,32 +838,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
...@@ -841,7 +900,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -841,7 +900,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
] ]
hids = [] hids = []
attentions = [] attentions = [] if output_attentions else None
if self.attn_type == 0: # default if self.attn_type == 0: # default
pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype)
if self.clamp_len > 0: if self.clamp_len > 0:
...@@ -872,19 +931,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -872,19 +931,24 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
new_mems = self._update_mems(hids, mems, mlen, qlen) new_mems = self._update_mems(hids, mems, mlen, qlen)
# We transpose back here to shape [bsz, len, hidden_dim]
outputs = [core_out.transpose(0, 1).contiguous(), new_mems]
if output_hidden_states: if output_hidden_states:
# Add last layer and transpose to library standard shape [bsz, len, hidden_dim] # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
hids.append(core_out) hids.append(core_out)
hids = list(t.transpose(0, 1).contiguous() for t in hids) hids = tuple(t.transpose(0, 1).contiguous() for t in hids)
outputs.append(hids) else:
hids = None
if output_attentions: if output_attentions:
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs.append(attentions) # We transpose back here to shape [bsz, len, hidden_dim]
core_out = core_out.transpose(0, 1).contiguous()
if return_tuple:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions) return TransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
)
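A hedged usage sketch; the ``mems`` round trip mirrors the ``mems`` argument described in TRANSFO_XL_INPUTS_DOCSTRING:

>>> from transformers import TransfoXLTokenizer, TransfoXLModel
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids)
>>> last_hidden = outputs.last_hidden_state               # (batch, seq_len, hidden)
>>> outputs = model(input_ids, mems=outputs.mems)         # feed the memories back in for the next segment
>>> core_out, new_mems = model(input_ids, return_tuple=True)[:2]  # old-style positional access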
@add_start_docstrings( @add_start_docstrings(
...@@ -936,7 +1000,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -936,7 +1000,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz) return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -946,6 +1015,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -946,6 +1015,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -954,29 +1024,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -954,29 +1024,8 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
Indices are selected in ``[-100, 0, ..., config.vocab_size]`` Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided)
Language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
if input_ids is not None: if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1) bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None: elif inputs_embeds is not None:
...@@ -991,6 +1040,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -991,6 +1040,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
...@@ -998,14 +1048,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -998,14 +1048,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
outputs = transformer_outputs[1:] outputs = transformer_outputs[1:]
softmax_output = self.crit(pred_hid, labels) softmax_output = self.crit(pred_hid, labels)
if labels is None: prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
softmax_output = softmax_output.view(bsz, tgt_len, -1) loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None
outputs = [softmax_output] + outputs
else: if return_tuple:
softmax_output = softmax_output.view(bsz, tgt_len - 1) output = (prediction_scores,) + outputs[1:]
outputs = [softmax_output, None] + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) return TransfoXLLMHeadModelOutput(
losses=loss,
prediction_scores=prediction_scores,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
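And for the LM head (hedged; no labels are passed here, so ``losses`` stays ``None`` and ``prediction_scores`` holds the log-probabilities produced by the adaptive softmax):

>>> from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids)
>>> scores = outputs.prediction_scores                    # (batch, seq_len, vocab_size) log-probabilities
>>> no_loss = outputs.losses                              # None, since no labels were provided
>>> mems = outputs.mems                                   # reusable on the next forward pass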
def get_output_embeddings(self): def get_output_embeddings(self):
""" Double-check if you are using adaptive softmax. """ Double-check if you are using adaptive softmax.
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
import inspect import inspect
import logging import logging
import os import os
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple from typing import Callable, Dict, List, Optional, Tuple
import torch import torch
...@@ -31,6 +32,7 @@ from .file_utils import ( ...@@ -31,6 +32,7 @@ from .file_utils import (
TF2_WEIGHTS_NAME, TF2_WEIGHTS_NAME,
TF_WEIGHTS_NAME, TF_WEIGHTS_NAME,
WEIGHTS_NAME, WEIGHTS_NAME,
ModelOutput,
cached_path, cached_path,
hf_bucket_url, hf_bucket_url,
is_remote_url, is_remote_url,
...@@ -941,6 +943,35 @@ class PoolerAnswerClass(nn.Module): ...@@ -941,6 +943,35 @@ class PoolerAnswerClass(nn.Module):
return x return x
@dataclass
class SquadHeadOutput(ModelOutput):
"""
Base class for outputs of question answering models using a :obj:`SQuADHead`.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
"""
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
class SQuADHead(nn.Module): class SQuADHead(nn.Module):
r""" A SQuAD head inspired by XLNet. r""" A SQuAD head inspired by XLNet.
...@@ -992,10 +1023,15 @@ class SQuADHead(nn.Module): ...@@ -992,10 +1023,15 @@ class SQuADHead(nn.Module):
self.answer_class = PoolerAnswerClass(config) self.answer_class = PoolerAnswerClass(config)
def forward( def forward(
self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, self,
hidden_states,
start_positions=None,
end_positions=None,
cls_index=None,
is_impossible=None,
p_mask=None,
return_tuple=False,
): ):
outputs = ()
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
...@@ -1021,7 +1057,7 @@ class SQuADHead(nn.Module): ...@@ -1021,7 +1057,7 @@ class SQuADHead(nn.Module):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5 total_loss += cls_loss * 0.5
outputs = (total_loss,) + outputs return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss)
else: else:
# during inference, compute the end logits based on beam search # during inference, compute the end logits based on beam search
...@@ -1051,11 +1087,16 @@ class SQuADHead(nn.Module): ...@@ -1051,11 +1087,16 @@ class SQuADHead(nn.Module):
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs if return_tuple:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
# return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits else:
# or (if labels are provided) (total_loss,) return SquadHeadOutput(
return outputs start_top_log_probs=start_top_log_probs,
start_top_index=start_top_index,
end_top_log_probs=end_top_log_probs,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
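A minimal sketch exercising the head directly with random weights; the choice of ``XLNetConfig`` is only an assumption made because it exposes the ``hidden_size``, ``start_n_top``, ``end_n_top`` and ``layer_norm_eps`` attributes the poolers read:

import torch
from transformers import XLNetConfig
from transformers.modeling_utils import SQuADHead

config = XLNetConfig()
head = SQuADHead(config)
hidden_states = torch.randn(2, 7, config.hidden_size)     # (batch, seq_len, hidden) from some encoder
outputs = head(hidden_states)                              # no positions -> beam-search style fields
start_top = outputs.start_top_log_probs                    # (batch, config.start_n_top)
answerability = outputs.cls_logits                         # (batch,)
legacy = head(hidden_states, return_tuple=True)            # 5-tuple, as before this change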
class SequenceSummary(nn.Module): class SequenceSummary(nn.Module):
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
import itertools import itertools
import logging import logging
import math import math
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np import numpy as np
import torch import torch
...@@ -28,7 +30,20 @@ from torch.nn import functional as F ...@@ -28,7 +30,20 @@ from torch.nn import functional as F
from .activations import gelu from .activations import gelu
from .configuration_xlm import XLMConfig from .configuration_xlm import XLMConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_outputs import (
BaseModelOutput,
MaskedLMOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from .modeling_utils import ( from .modeling_utils import (
PreTrainedModel, PreTrainedModel,
SequenceSummary, SequenceSummary,
...@@ -40,6 +55,7 @@ from .modeling_utils import ( ...@@ -40,6 +55,7 @@ from .modeling_utils import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLMConfig"
_TOKENIZER_FOR_DOC = "XLMTokenizer" _TOKENIZER_FOR_DOC = "XLMTokenizer"
XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -240,6 +256,47 @@ class XLMPreTrainedModel(PreTrainedModel): ...@@ -240,6 +256,47 @@ class XLMPreTrainedModel(PreTrainedModel):
module.weight.data.fill_(1.0) module.weight.data.fill_(1.0)
@dataclass
class XLMForQuestionAnsweringOutput(ModelOutput):
"""
Base class for outputs of question answering models using a :obj:`SQuADHead`.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
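Assuming ``XLMForQuestionAnswering`` (whose forward sits outside this hunk) wires this output up the same way as the heads below, inference-time usage would look roughly like:

>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids)                            # no positions -> beam-search style fields
>>> start_top = outputs.start_top_log_probs               # (batch, config.start_n_top)
>>> answerability = outputs.cls_logits                    # is_impossible logits, shape (batch,)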
XLM_START_DOCSTRING = r""" XLM_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
...@@ -306,6 +363,10 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -306,6 +363,10 @@ XLM_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
""" """
...@@ -397,7 +458,12 @@ class XLMModel(XLMPreTrainedModel): ...@@ -397,7 +458,12 @@ class XLMModel(XLMPreTrainedModel):
self.attentions[layer].prune_heads(heads) self.attentions[layer].prune_heads(heads)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -411,28 +477,13 @@ class XLMModel(XLMPreTrainedModel): ...@@ -411,28 +477,13 @@ class XLMModel(XLMPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
if input_ids is not None: if input_ids is not None:
bs, slen = input_ids.size() bs, slen = input_ids.size()
...@@ -502,8 +553,8 @@ class XLMModel(XLMPreTrainedModel): ...@@ -502,8 +553,8 @@ class XLMModel(XLMPreTrainedModel):
tensor *= mask.unsqueeze(-1).to(tensor.dtype) tensor *= mask.unsqueeze(-1).to(tensor.dtype)
# transformer layers # transformer layers
hidden_states = () hidden_states = () if output_hidden_states else None
attentions = () attentions = () if output_attentions else None
for i in range(self.n_layers): for i in range(self.n_layers):
if output_hidden_states: if output_hidden_states:
hidden_states = hidden_states + (tensor,) hidden_states = hidden_states + (tensor,)
...@@ -542,12 +593,9 @@ class XLMModel(XLMPreTrainedModel): ...@@ -542,12 +593,9 @@ class XLMModel(XLMPreTrainedModel):
# move back sequence length to dimension 0 # move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
outputs = (tensor,) if return_tuple:
if output_hidden_states: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
outputs = outputs + (hidden_states,) return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
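A hedged usage sketch, assuming the checkpoint config leaves ``output_hidden_states`` and ``output_attentions`` off by default so those fields come back as ``None``:

>>> from transformers import XLMTokenizer, XLMModel
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMModel.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids)
>>> last_hidden = outputs.last_hidden_state               # hidden_states/attentions are None unless requested
>>> attn = model(input_ids, output_attentions=True).attentions   # one tensor per layer
>>> (last_hidden,) = model(input_ids, return_tuple=True)  # only the non-None entries survive in the tuple form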
class XLMPredLayer(nn.Module): class XLMPredLayer(nn.Module):
...@@ -623,7 +671,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -623,7 +671,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
return {"input_ids": input_ids, "langs": langs} return {"input_ids": input_ids, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=MaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -638,6 +691,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -638,6 +691,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -646,25 +700,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -646,25 +700,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
Indices are selected in ``[-100, 0, ..., config.vocab_size]`` Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
Language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -677,13 +715,21 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -677,13 +715,21 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs if return_tuple:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
loss=outputs[0] if labels is not None else None,
logits=outputs[0] if labels is None else outputs[1],
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
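A short sketch of the mapping above, which unpacks the ``(loss, logits)`` / ``(logits,)`` tuple returned by ``pred_layer`` into named fields:

>>> from transformers import XLMTokenizer, XLMWithLMHeadModel
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(input_ids)
>>> logits = outputs.logits                               # prediction scores from pred_layer
>>> no_loss = outputs.loss                                # None, since no labels were given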
@add_start_docstrings( @add_start_docstrings(
...@@ -702,7 +748,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -702,7 +748,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=SequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -717,6 +768,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -717,6 +768,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -724,25 +776,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -724,25 +776,9 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -755,13 +791,13 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -755,13 +791,13 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here loss = None
if labels is not None: if labels is not None:
if self.num_labels == 1: if self.num_labels == 1:
# We are doing regression # We are doing regression
...@@ -770,9 +806,17 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -770,9 +806,17 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
else: else:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs if return_tuple:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
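A hedged sketch with a classification label (the label value is arbitrary and only illustrates the shapes):

>>> import torch
>>> from transformers import XLMTokenizer, XLMForSequenceClassification
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1])                            # one label per sequence in the batch
>>> outputs = model(input_ids, labels=labels)
>>> loss, logits = outputs.loss, outputs.logits
>>> loss, logits = model(input_ids, labels=labels, return_tuple=True)[:2]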
@add_start_docstrings( @add_start_docstrings(
...@@ -790,7 +834,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -790,7 +834,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=QuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -806,6 +855,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -806,6 +855,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -816,27 +866,9 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -816,27 +866,9 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -849,6 +881,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -849,6 +881,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
sequence_output = transformer_outputs[0] sequence_output = transformer_outputs[0]
...@@ -858,10 +891,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -858,10 +891,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
start_logits = start_logits.squeeze(-1) start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1) end_logits = end_logits.squeeze(-1)
outputs = ( total_loss = None
start_logits,
end_logits,
)
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension # If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1: if len(start_positions.size()) > 1:
...@@ -877,11 +907,18 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -877,11 +907,18 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions) end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here if return_tuple:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return outputs return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
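A rough span-extraction sketch over the new ``QuestionAnsweringModelOutput`` fields (the checkpoint name is illustrative and its QA head is untrained, so the decoded answer is meaningless)::

    from transformers import XLMForQuestionAnsweringSimple, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForQuestionAnsweringSimple.from_pretrained("xlm-mlm-en-2048")
    inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")

    outputs = model(**inputs)                          # QuestionAnsweringModelOutput
    start = int(outputs.start_logits.argmax(dim=-1))   # most likely span start
    end = int(outputs.end_logits.argmax(dim=-1))       # most likely span end
    answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1].tolist())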
@add_start_docstrings( @add_start_docstrings(
...@@ -899,6 +936,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -899,6 +936,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=XLMForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -917,6 +955,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -917,6 +955,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
p_mask=None, p_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -936,30 +975,6 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -936,30 +975,6 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
1.0 means the token should be masked. 0.0 means the token is not masked. 1.0 means the token should be masked. 0.0 means the token is not masked.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example:: Example::
...@@ -976,6 +991,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -976,6 +991,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0] >>> loss = outputs[0]
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -988,6 +1005,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -988,6 +1005,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -999,11 +1017,22 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -999,11 +1017,22 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
cls_index=cls_index, cls_index=cls_index,
is_impossible=is_impossible, is_impossible=is_impossible,
p_mask=p_mask, p_mask=p_mask,
return_tuple=return_tuple,
) )
outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here if return_tuple:
return outputs + transformer_outputs[1:]
return outputs
return XLMForQuestionAnsweringOutput(
loss=outputs.loss,
start_top_log_probs=outputs.start_top_log_probs,
start_top_index=outputs.start_top_index,
end_top_log_probs=outputs.end_top_log_probs,
end_top_index=outputs.end_top_index,
cls_logits=outputs.cls_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
...@@ -1023,7 +1052,12 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1023,7 +1052,12 @@ class XLMForTokenClassification(XLMPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1034,33 +1068,19 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1034,33 +1068,19 @@ class XLMForTokenClassification(XLMPreTrainedModel):
lengths=None, lengths=None,
cache=None, cache=None,
head_mask=None, head_mask=None,
inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1070,8 +1090,10 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1070,8 +1090,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1079,7 +1101,7 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1079,7 +1101,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
sequence_output = self.dropout(sequence_output) sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None
if labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss # Only keep active parts of the loss
...@@ -1092,6 +1114,11 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1092,6 +1114,11 @@ class XLMForTokenClassification(XLMPreTrainedModel):
loss = loss_fct(active_logits, active_labels) loss = loss_fct(active_logits, active_labels)
else: else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), scores, (hidden_states), (attentions) if return_tuple:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
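The same pattern applied to token classification, as a sketch (the ``num_labels=9`` head is a made-up, randomly initialised example)::

    from transformers import XLMForTokenClassification, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForTokenClassification.from_pretrained("xlm-mlm-en-2048", num_labels=9)
    inputs = tokenizer("Jim lives in Paris", return_tensors="pt")

    outputs = model(**inputs)                     # TokenClassifierOutput by default
    predictions = outputs.logits.argmax(dim=-1)   # (batch_size, sequence_length) tag ids
    assert outputs.loss is None                   # loss is only set when labels are passed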
...@@ -55,6 +55,10 @@ XLM_ROBERTA_START_DOCSTRING = r""" ...@@ -55,6 +55,10 @@ XLM_ROBERTA_START_DOCSTRING = r"""
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
""" """
......
...@@ -18,6 +18,8 @@ ...@@ -18,6 +18,8 @@
import logging import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import torch import torch
from torch import nn from torch import nn
...@@ -26,12 +28,19 @@ from torch.nn import functional as F ...@@ -26,12 +28,19 @@ from torch.nn import functional as F
from .activations import gelu_new, swish from .activations import gelu_new, swish
from .configuration_xlnet import XLNetConfig from .configuration_xlnet import XLNetConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLNetConfig"
_TOKENIZER_FOR_DOC = "XLNetTokenizer" _TOKENIZER_FOR_DOC = "XLNetTokenizer"
XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -554,6 +563,264 @@ class XLNetPreTrainedModel(PreTrainedModel): ...@@ -554,6 +563,264 @@ class XLNetPreTrainedModel(PreTrainedModel):
module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
@dataclass
class XLNetModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.XLNetModel`.
Args:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
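A short sketch of how this output type is consumed (checkpoint name assumed; ``mems`` stays ``None`` unless ``config.mem_len`` is set and ``use_cache`` is ``True``)::

    from transformers import XLNetModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetModel.from_pretrained("xlnet-base-cased")
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

    out = model(**inputs)            # an XLNetModelOutput unless return_tuple=True
    hidden = out.last_hidden_state   # (batch_size, sequence_length, hidden_size)
    cached = out.mems                # list of per-layer memories, or None (see above)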
@dataclass
class XLNetLMHeadModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.XLNetLMHeadModel`.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class XLNetForSequenceClassificationOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class XLNetForTokenClassificationOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Classification loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class XLNetForMultipleChoiceOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
Classification scores (before SoftMax).
`num_choices` is the second dimension of the input tensors (see `input_ids` above).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor]
start_logits: torch.FloatTensor
end_logits: torch.FloatTensor
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class XLNetForQuestionAnsweringOutput(ModelOutput):
"""
Base class for outputs of question answering models using a :obj:`SquadHead`.
Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
start_top_log_probs: Optional[torch.FloatTensor] = None
start_top_index: Optional[torch.LongTensor] = None
end_top_log_probs: Optional[torch.FloatTensor] = None
end_top_index: Optional[torch.LongTensor] = None
cls_logits: Optional[torch.FloatTensor] = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
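A rough decoding sketch over these beam-search fields, following the shapes documented above (model, inputs and the top-1 selection are illustrative; the untuned QA head makes the indices meaningless)::

    from transformers import XLNetForQuestionAnswering, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
    inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")

    out = model(**inputs)                    # no start/end positions -> beam fields are set
    best_start = out.start_top_index[:, 0]   # top-1 start index per example
    end_beams = out.end_top_index.view(-1, model.config.start_n_top, model.config.end_n_top)
    best_end = end_beams[:, 0, 0]            # top-1 end index paired with that start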
XLNET_START_DOCSTRING = r""" XLNET_START_DOCSTRING = r"""
This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
...@@ -622,6 +889,10 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -622,6 +889,10 @@ XLNET_INPUTS_DOCSTRING = r"""
If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`. If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
""" """
...@@ -751,7 +1022,12 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -751,7 +1022,12 @@ class XLNetModel(XLNetPreTrainedModel):
return pos_emb return pos_emb
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=XLNetModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -766,33 +1042,13 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -766,33 +1042,13 @@ class XLNetModel(XLNetPreTrainedModel):
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension # but we want a unified interface in the library with the batch size on the first dimension
...@@ -920,8 +1176,8 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -920,8 +1176,8 @@ class XLNetModel(XLNetPreTrainedModel):
if mems is None: if mems is None:
mems = [None] * len(self.layer) mems = [None] * len(self.layer)
attentions = [] attentions = [] if output_attentions else None
hidden_states = [] hidden_states = [] if output_hidden_states else None
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if self.mem_len is not None and self.mem_len > 0 and use_cache is True: if self.mem_len is not None and self.mem_len > 0 and use_cache is True:
# cache new mems # cache new mems
...@@ -952,17 +1208,18 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -952,17 +1208,18 @@ class XLNetModel(XLNetPreTrainedModel):
output = self.dropout(output_g if output_g is not None else output_h) output = self.dropout(output_g if output_g is not None else output_h)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs = (output.permute(1, 0, 2).contiguous(),) output = output.permute(1, 0, 2).contiguous()
if self.mem_len is not None and self.mem_len > 0 and use_cache is True: # TODO Teven: fix this test to only use use_cache.
outputs = outputs + (new_mems,) if not (self.mem_len is not None and self.mem_len > 0 and use_cache is True):
new_mems = None
if output_hidden_states: if output_hidden_states:
if output_g is not None: if output_g is not None:
hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs) hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
else: else:
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states) hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
outputs = outputs + (hidden_states,)
if output_attentions: if output_attentions:
if target_mapping is not None: if target_mapping is not None:
# when target_mapping is provided, there are 2-tuple of attentions # when target_mapping is provided, there are 2-tuple of attentions
...@@ -971,9 +1228,13 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -971,9 +1228,13 @@ class XLNetModel(XLNetPreTrainedModel):
) )
else: else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, (new_mems), (hidden_states), (attentions) if return_tuple:
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
return XLNetModelOutput(
last_hidden_state=output, mems=new_mems, hidden_states=hidden_states, attentions=attentions
)
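Since ``hidden_states`` and ``attentions`` are now initialised to ``None`` unless requested, the matching output fields are only populated on demand; a quick check, reusing the ``xlnet-base-cased`` model and inputs from the earlier sketch::

    out = model(**inputs)
    assert out.hidden_states is None and out.attentions is None   # not requested

    out = model(**inputs, output_hidden_states=True, output_attentions=True)
    print(len(out.hidden_states), len(out.attentions))   # embeddings + one per layer / one per layer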
@add_start_docstrings( @add_start_docstrings(
...@@ -1029,6 +1290,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1029,6 +1290,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
return inputs return inputs
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=XLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1040,10 +1302,11 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1040,10 +1302,11 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
input_mask=None, input_mask=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
labels=None, return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
...@@ -1055,27 +1318,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1055,27 +1318,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided)
Language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
`num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
...@@ -1108,6 +1350,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1108,6 +1350,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
loss, next_token_logits = outputs[:2] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] loss, next_token_logits = outputs[:2] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1121,19 +1365,28 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1121,19 +1365,28 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
logits = self.lm_loss(transformer_outputs[0]) logits = self.lm_loss(transformer_outputs[0])
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it loss = None
if labels is not None: if labels is not None:
# Flatten the tokens # Flatten the tokens
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) if return_tuple:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return XLNetLMHeadModelOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
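With the dataclass default, the quantities in the ``Examples::`` block above are also available by name; a short follow-up sketch reusing ``input_ids`` and ``labels`` from that example::

    outputs = model(input_ids, labels=labels)
    loss, next_token_logits = outputs.loss, outputs.logits   # dataclass fields by name
    past = outputs.mems                                      # None unless config.mem_len > 0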
@add_start_docstrings( @add_start_docstrings(
...@@ -1153,7 +1406,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1153,7 +1406,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=XLNetForSequenceClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1165,10 +1423,11 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1165,10 +1423,11 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
input_mask=None, input_mask=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
use_cache=True,
labels=None, labels=None,
use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1176,29 +1435,9 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1176,29 +1435,9 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1212,14 +1451,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1212,14 +1451,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
output = self.sequence_summary(output) output = self.sequence_summary(output)
logits = self.logits_proj(output) logits = self.logits_proj(output)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it loss = None
if labels is not None: if labels is not None:
if self.num_labels == 1: if self.num_labels == 1:
# We are doing regression # We are doing regression
...@@ -1228,9 +1467,18 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1228,9 +1467,18 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
else: else:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) if return_tuple:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return XLNetForSequenceClassificationOutput(
loss=loss,
logits=logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
...@@ -1249,7 +1497,12 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1249,7 +1497,12 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=XLNetForTokenClassificationOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1261,39 +1514,19 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1261,39 +1514,19 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
input_mask=None, input_mask=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
use_cache=True,
labels=None, labels=None,
use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`torch.FloatTensor` of shape :obj:(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -1308,13 +1541,14 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1308,13 +1541,14 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it loss = None
if labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss # Only keep active parts of the loss
...@@ -1327,9 +1561,18 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1327,9 +1561,18 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
loss = loss_fct(active_logits, active_labels) loss = loss_fct(active_logits, active_labels)
else: else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) if return_tuple:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return XLNetForTokenClassificationOutput(
loss=loss,
logits=logits,
mems=outputs.mems,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
...@@ -1348,7 +1591,12 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1348,7 +1591,12 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=XLNetForMultipleChoiceOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1360,41 +1608,19 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1360,41 +1608,19 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
target_mapping=None, target_mapping=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
use_cache=True,
labels=None, labels=None,
use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
...@@ -1420,6 +1646,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1420,6 +1646,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1427,16 +1654,23 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1427,16 +1654,23 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
output = self.sequence_summary(output) output = self.sequence_summary(output)
logits = self.logits_proj(output) logits = self.logits_proj(output)
reshaped_logits = logits.view(-1, num_choices) reshaped_logits = logits.view(-1, num_choices)
outputs = (reshaped_logits,) + transformer_outputs[
1:
] # Keep mems, hidden states, attentions if there are in it
loss = None
if labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1)) loss = loss_fct(reshaped_logits, labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions) if return_tuple:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return XLNetForMultipleChoiceOutput(
loss=loss,
logits=reshaped_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
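A hedged sketch (not from the diff) of how the multiple-choice head consumes inputs of shape (batch_size, num_choices, sequence_length) and how the new XLNetForMultipleChoiceOutput is read. The prompt, choices, and checkpoint are illustrative, and padding is handled naively here without an attention mask.

import torch
from transformers import XLNetTokenizer, XLNetForMultipleChoice

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForMultipleChoice.from_pretrained("xlnet-base-cased")

prompt = "The sky is"
choices = ["blue.", "made of cheese."]
encodings = [tokenizer.encode(prompt, choice, add_special_tokens=True) for choice in choices]
max_len = max(len(e) for e in encodings)
padded = [e + [tokenizer.pad_token_id] * (max_len - len(e)) for e in encodings]
input_ids = torch.tensor(padded).unsqueeze(0)   # shape (1, num_choices, sequence_length)
labels = torch.tensor([0])                      # index of the correct choice

outputs = model(input_ids, labels=labels)
print(outputs.loss, outputs.logits)             # logits has shape (batch_size, num_choices)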
@add_start_docstrings( @add_start_docstrings(
...@@ -1455,7 +1689,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1455,7 +1689,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlnet-base-cased",
output_type=XLNetForQuestionAnsweringSimpleOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1467,11 +1706,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1467,11 +1706,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
input_mask=None, input_mask=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
use_cache=True,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1482,31 +1722,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1482,31 +1722,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -1521,6 +1738,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1521,6 +1738,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1530,7 +1748,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1530,7 +1748,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
start_logits = start_logits.squeeze(-1) start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1) end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,) + outputs[2:] total_loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension # If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1: if len(start_positions.size()) > 1:
...@@ -1546,9 +1764,19 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1546,9 +1764,19 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
start_loss = loss_fct(start_logits, start_positions) start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions) end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) if return_tuple:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return XLNetForQuestionAnsweringSimpleOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
mems=outputs.mems,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
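A minimal span-extraction sketch using the named start_logits/end_logits fields of XLNetForQuestionAnsweringSimpleOutput; passing return_tuple=True would instead yield the former (start_logits, end_logits, ...) tuple. The question/context pair is illustrative and the base checkpoint's QA head is untrained.

import torch
from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForQuestionAnsweringSimple.from_pretrained("xlnet-base-cased")

question = "Who wrote Crime and Punishment?"
context = "Crime and Punishment was written by Fyodor Dostoevsky."
input_ids = torch.tensor(tokenizer.encode(question, context, add_special_tokens=True)).unsqueeze(0)

outputs = model(input_ids)                                # XLNetForQuestionAnsweringSimpleOutput
start = int(torch.argmax(outputs.start_logits, dim=-1))   # most likely span start
end = int(torch.argmax(outputs.end_logits, dim=-1))       # most likely span end
answer = tokenizer.decode(input_ids[0, start : end + 1].tolist())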
@add_start_docstrings( @add_start_docstrings(
...@@ -1570,6 +1798,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1570,6 +1798,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@replace_return_docstrings(output_type=XLNetForQuestionAnsweringOutput, config_class=_CONFIG_FOR_DOC)
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1581,14 +1810,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1581,14 +1810,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
input_mask=None, input_mask=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
use_cache=True,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
is_impossible=None, is_impossible=None,
cls_index=None, cls_index=None,
p_mask=None, p_mask=None,
use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1608,50 +1838,24 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1608,50 +1838,24 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
1.0 means token should be masked. 0.0 mean token is not masked. 1.0 means token should be masked. 0.0 mean token is not masked.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the ``is_impossible`` label of the answers.
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example:: Example::
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch >>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3]) >>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs[0] >>> loss = outputs[0]
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1665,6 +1869,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1665,6 +1869,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
...@@ -1694,7 +1899,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1694,7 +1899,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5 total_loss += cls_loss * 0.5
outputs = (total_loss,) + outputs if return_tuple:
return (total_loss,) + transformer_outputs[1:]
else:
return XLNetForQuestionAnsweringOutput(
loss=total_loss,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
else: else:
# during inference, compute the end logits based on beam search # during inference, compute the end logits based on beam search
...@@ -1728,8 +1941,17 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1728,8 +1941,17 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
hidden_states, start_states=start_states, cls_index=cls_index hidden_states, start_states=start_states, cls_index=cls_index
) # Shape (batch size,): one single `cls_logits` for each sample ) # Shape (batch size,): one single `cls_logits` for each sample
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs if return_tuple:
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
# return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits return outputs + transformer_outputs[1:]
# or (if labels are provided) (total_loss,) else:
return outputs return XLNetForQuestionAnsweringOutput(
start_top_log_probs=start_top_log_probs,
start_top_index=start_top_index,
end_top_log_probs=end_top_log_probs,
end_top_index=end_top_index,
cls_logits=cls_logits,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
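A hedged inference sketch (not part of the commit): when no start/end positions are given, the beam-search variant now returns an XLNetForQuestionAnsweringOutput whose top-k fields are read by name, with shapes following config.start_n_top and config.end_n_top; the input text is illustrative.

import torch
from transformers import XLNetTokenizer, XLNetForQuestionAnswering

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)

outputs = model(input_ids)                   # no positions -> beam-search scores, no loss
print(outputs.start_top_log_probs.shape)     # (batch_size, config.start_n_top)
print(outputs.end_top_log_probs.shape)       # (batch_size, config.start_n_top * config.end_n_top)
print(outputs.cls_logits.shape)              # (batch_size,)

# The previous tuple layout is still one flag away:
start_top_log_probs, start_top_index, *rest = model(input_ids, return_tuple=True)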
...@@ -220,7 +220,6 @@ class ModelTesterMixin: ...@@ -220,7 +220,6 @@ class ModelTesterMixin:
def test_torchscript(self): def test_torchscript(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
self._create_and_check_torchscript(config, inputs_dict) self._create_and_check_torchscript(config, inputs_dict)
def test_torchscript_output_attentions(self): def test_torchscript_output_attentions(self):
...@@ -230,7 +229,6 @@ class ModelTesterMixin: ...@@ -230,7 +229,6 @@ class ModelTesterMixin:
def test_torchscript_output_hidden_state(self): def test_torchscript_output_hidden_state(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_hidden_states = True config.output_hidden_states = True
self._create_and_check_torchscript(config, inputs_dict) self._create_and_check_torchscript(config, inputs_dict)
......
...@@ -355,6 +355,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -355,6 +355,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
import tempfile import tempfile
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
config_and_inputs[0].return_tuple = True
model = T5Model(config_and_inputs[0]) model = T5Model(config_and_inputs[0])
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
torch.onnx.export( torch.onnx.export(
......
...@@ -319,7 +319,7 @@ class TFModelTesterMixin: ...@@ -319,7 +319,7 @@ class TFModelTesterMixin:
outputs_dict = model(input_ids) outputs_dict = model(input_ids)
hidden_states = outputs_dict[0] hidden_states = outputs_dict[0]
# Add a dense layer on top to test intetgration with other keras modules # Add a dense layer on top to test integration with other keras modules
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
# Compile extended model # Compile extended model
......
...@@ -347,6 +347,7 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -347,6 +347,7 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
XLMForQuestionAnswering, XLMForQuestionAnswering,
XLMForSequenceClassification, XLMForSequenceClassification,
XLMForQuestionAnsweringSimple, XLMForQuestionAnsweringSimple,
XLMForTokenClassification,
) )
if is_torch_available() if is_torch_available()
else () else ()
......
...@@ -35,6 +35,7 @@ if is_torch_available(): ...@@ -35,6 +35,7 @@ if is_torch_available():
XLNetForSequenceClassification, XLNetForSequenceClassification,
XLNetForTokenClassification, XLNetForTokenClassification,
XLNetForQuestionAnswering, XLNetForQuestionAnswering,
XLNetForQuestionAnsweringSimple,
) )
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_LIST
...@@ -458,6 +459,7 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -458,6 +459,7 @@ class XLNetModelTest(ModelTesterMixin, unittest.TestCase):
XLNetForTokenClassification, XLNetForTokenClassification,
XLNetForSequenceClassification, XLNetForSequenceClassification,
XLNetForQuestionAnswering, XLNetForQuestionAnswering,
XLNetForQuestionAnsweringSimple,
XLNetForMultipleChoice, XLNetForMultipleChoice,
) )
if is_torch_available() if is_torch_available()
......