Unverified Commit 91cb9546 authored by Sylvain Gugger, committed by GitHub

Switch from return_tuple to return_dict (#6138)



* Switch from return_tuple to return_dict

* Fix test

* [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)

* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests

* AutoModels


Tiny tweaks

* Style

* Final changes before merge

* Re-order for simpler review

* Final fixes

* Addressing @sgugger's comments

* Test MultipleChoice

* Rework TF trainer (#6038)

* Fully rework training/prediction loops

* fix method name

* Fix variable name

* Fix property name

* Fix scope

* Fix method name

* Fix tuple index

* Fix tuple index

* Fix indentation

* Fix variable name

* fix eval before log

* Add drop remainder for test dataset

* Fix step number + fix logging datetime

* fix eval loss value

* use global step instead of step + fix logging at step 0

* Fix logging datetime

* Fix global_step usage

* Fix breaking loop + logging datetime

* Fix step in prediction loop

* Fix step breaking

* Fix train/test loops

* Force TF at least 2.2 for the trainer

* Use assert_cardinality to facilitate the dataset size computation

* Log steps per epoch

* Make tfds compliant with TPU

* Make tfds compliant with TPU

* Use TF dataset enumerate instead of the Python one

* revert previous commit

* Fix data_dir

* Apply style

* rebase on master

* Address Sylvain's comments

* Address Sylvain's and Lysandre comments

* Trigger CI

* Remove unused import

* Switch from return_tuple to return_dict

* Fix test

* Add recent model
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Julien Plu <plu.julien@gmail.com>
parent 562b6369
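
The diff below applies the same mechanical change across the PyTorch models: forward() loses its return_tuple argument in favour of return_dict (resolved against the new config.use_return_dict), and the backward-compatibility branch becomes "if not return_dict: return the plain tuple". As a minimal sketch, not part of the commit, of the two calling conventions after the change, using the openai-gpt checkpoint that the docstrings below already use (at this point return_dict still defaults to False, so tuples remain the default behaviour):

    from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt", return_dict=True)
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

    # New style: a ModelOutput with named fields (it can still be indexed like a tuple).
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss, logits = outputs.loss, outputs.logits

    # Old style: the plain tuple, requested here explicitly via return_dict=False.
    loss, logits = model(**inputs, labels=inputs["input_ids"], return_dict=False)[:2]
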
modeling_openai.py
@@ -315,10 +315,10 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
             heads.
     """
-    lm_loss: Optional[torch.FloatTensor]
-    mc_loss: Optional[torch.FloatTensor]
-    lm_logits: torch.FloatTensor
-    mc_logits: torch.FloatTensor
+    lm_loss: Optional[torch.FloatTensor] = None
+    mc_loss: Optional[torch.FloatTensor] = None
+    lm_logits: torch.FloatTensor = None
+    mc_logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -374,8 +374,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -425,13 +426,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -496,7 +497,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
-        if return_tuple:
+        if not return_dict:
             return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
         return BaseModelOutput(
@@ -538,7 +539,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -548,7 +549,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         transformer_outputs = self.transformer(
             input_ids,
@@ -559,7 +560,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
@@ -573,7 +574,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
@@ -622,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_labels=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
         **kwargs
     ):
         r"""
@@ -650,7 +651,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             import torch
             tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-            model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+            model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
             tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
             model.resize_token_embeddings(len(tokenizer))
@@ -662,7 +663,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             lm_logits = outputs.lm_logits
             mc_logits = outputs.mc_logits
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if "lm_labels" in kwargs:
             warnings.warn(
                 "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
@@ -680,7 +681,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         hidden_states = transformer_outputs[0]
@@ -698,7 +699,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (lm_logits, mc_logits) + transformer_outputs[1:]
             if mc_loss is not None:
                 output = (mc_loss,) + output
...
modeling_outputs.py
@@ -63,7 +63,7 @@ class BaseModelOutputWithPooling(ModelOutput):
     """
     last_hidden_state: torch.FloatTensor
-    pooler_output: torch.FloatTensor
+    pooler_output: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -179,7 +179,7 @@ class CausalLMOutput(ModelOutput):
     """
     loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -213,8 +213,8 @@ class CausalLMOutputWithPast(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     past_key_values: Optional[List[torch.FloatTensor]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -243,8 +243,8 @@ class MaskedLMOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -291,8 +291,8 @@ class Seq2SeqLMOutput(ModelOutput):
             self-attention heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -324,8 +324,8 @@ class NextSentencePredictorOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -353,8 +353,8 @@ class SequenceClassifierOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -401,8 +401,8 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput):
             self-attention heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -436,8 +436,8 @@ class MultipleChoiceModelOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -465,8 +465,8 @@ class TokenClassifierOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -496,9 +496,9 @@ class QuestionAnsweringModelOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    start_logits: torch.FloatTensor
-    end_logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    start_logits: torch.FloatTensor = None
+    end_logits: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -547,9 +547,9 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
             self-attention heads.
     """
-    loss: Optional[torch.FloatTensor]
-    start_logits: torch.FloatTensor
-    end_logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    start_logits: torch.FloatTensor = None
+    end_logits: torch.FloatTensor = None
     decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
     decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
...
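
A note on the modeling_outputs.py changes above (not part of the commit): the fields of these output dataclasses pick up None defaults because a dataclass field can only be omitted at construction time if it has a default, and each forward pass builds its output from only the values it actually computed; the base ModelOutput class is designed to skip None entries when the result is converted back to a tuple. A small self-contained sketch using a toy stand-in class, not one of the real output classes:

    from dataclasses import dataclass
    from typing import Optional, Tuple

    import torch

    @dataclass
    class ToyClassifierOutput:  # hypothetical stand-in for the real ModelOutput subclasses
        loss: Optional[torch.FloatTensor] = None
        logits: torch.FloatTensor = None
        hidden_states: Optional[Tuple[torch.FloatTensor]] = None
        attentions: Optional[Tuple[torch.FloatTensor]] = None

    # A forward pass that received no labels and was not asked for hidden states
    # can still construct its output, setting only what it computed:
    out = ToyClassifierOutput(logits=torch.zeros(1, 2))
    assert out.loss is None and out.hidden_states is None
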
modeling_reformer.py
@@ -39,13 +39,7 @@ from .file_utils import (
     add_start_docstrings,
     add_start_docstrings_to_callable,
 )
-from .modeling_outputs import (
-    BaseModelOutput,
-    CausalLMOutput,
-    MaskedLMOutput,
-    QuestionAnsweringModelOutput,
-    SequenceClassifierOutput,
-)
+from .modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
 from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
@@ -1851,8 +1845,8 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
             heads.
     """
-    loss: Optional[torch.FloatTensor]
-    logits: torch.FloatTensor
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
     past_buckets_states: Optional[List[Tuple[torch.LongTensor, torch.FloatTensor]]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
@@ -1922,8 +1916,9 @@ REFORMER_INPUTS_DOCSTRING = r"""
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -1962,7 +1957,7 @@ class ReformerModel(ReformerPreTrainedModel):
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/reformer-crime-and-punishment",
-        output_type=BaseModelOutput,
+        output_type=ReformerModelOutput,
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
@@ -1977,40 +1972,14 @@ class ReformerModel(ReformerPreTrainedModel):
         use_cache=None,
         output_hidden_states=None,
         output_attentions=None,
-        return_tuple=None,
+        return_dict=None,
     ):
-        r"""
-        Return:
-            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
-            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-                Sequence of hidden-states at the output of the last layer of the model.
-            past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
-                List of :obj:`tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with :obj:`tuple(0)` being the previous `buckets` of shape
-                :obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
-                and :obj:`tuple(1)` being the previous `hidden_states` of shape
-                :obj:`(batch_size, sequence_length, hidden_size)`).
-                Contains pre-computed buckets and hidden-states that can be used (see
-                ``past_buckets_states`` input) to speed up sequential decoding.
-            all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.
-        """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -2102,7 +2071,7 @@ class ReformerModel(ReformerPreTrainedModel):
         hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None
         attentions = encoder_outputs.all_attentions if output_attentions else None
-        if return_tuple:
+        if not return_dict:
             return tuple(v for v in [sequence_output, past_buckets_states, hidden_states, attentions] if v is not None)
         return ReformerModelOutput(
             last_hidden_state=sequence_output,
@@ -2208,7 +2177,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
         use_cache=None,
         output_hidden_states=None,
         output_attentions=None,
-        return_tuple=None,
+        return_dict=None,
         labels=None,
     ):
         r"""
@@ -2218,7 +2187,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         reformer_outputs = self.reformer(
             input_ids,
@@ -2231,7 +2200,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             use_cache=use_cache,
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = reformer_outputs[0]
@@ -2246,7 +2215,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (logits,) + reformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
@@ -2326,7 +2295,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
         labels=None,
         output_hidden_states=None,
         output_attentions=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -2334,7 +2303,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
        """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         reformer_outputs = self.reformer(
             input_ids,
@@ -2346,7 +2315,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
             use_cache=False,  # no causal mask
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = reformer_outputs[0]
@@ -2357,7 +2326,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
             loss_fct = CrossEntropyLoss()  # -100 index = padding token
             masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (logits,) + reformer_outputs[1:]
             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -2408,7 +2377,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
         labels=None,
         output_hidden_states=None,
         output_attentions=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -2427,7 +2396,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
             num_hashes=num_hashes,
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = outputs[0]
@@ -2443,7 +2412,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -2511,7 +2480,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
         end_positions=None,
         output_hidden_states=None,
         output_attentions=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -2523,7 +2492,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         reformer_outputs = self.reformer(
             input_ids,
@@ -2535,7 +2504,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             use_cache=False,  # no causal mask
             output_hidden_states=output_hidden_states,
             output_attentions=output_attentions,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = reformer_outputs[0]
@@ -2562,7 +2531,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-        if return_tuple:
+        if not return_dict:
             output = (start_logits, end_logits) + reformer_outputs[1:]
             return ((total_loss,) + output) if total_loss is not None else output
...
modeling_roberta.py
@@ -143,8 +143,9 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -208,7 +209,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
         **kwargs
     ):
         r"""
@@ -227,7 +228,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             )
             labels = kwargs.pop("masked_lm_labels")
         assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.roberta(
             input_ids,
@@ -238,7 +239,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
@@ -248,7 +249,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (prediction_scores,) + outputs[2:]
             return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
@@ -321,7 +322,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -330,7 +331,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.roberta(
             input_ids,
@@ -341,7 +342,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = outputs[0]
         logits = self.classifier(sequence_output)
@@ -356,7 +357,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
                 loss_fct = CrossEntropyLoss()
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -401,7 +402,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -409,7 +410,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -431,7 +432,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             inputs_embeds=flat_inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         pooled_output = outputs[1]
@@ -444,7 +445,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
-        if return_tuple:
+        if not return_dict:
             output = (reshaped_logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -490,14 +491,14 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         labels=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the token classification loss.
             Indices should be in ``[0, ..., config.num_labels - 1]``.
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.roberta(
             input_ids,
@@ -508,7 +509,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
        )
         sequence_output = outputs[0]
@@ -530,7 +531,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-        if return_tuple:
+        if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
@@ -595,7 +596,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         end_positions=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -607,7 +608,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             Positions are clamped to the length of the sequence (`sequence_length`).
             Position outside of the sequence are not taken into account for computing the loss.
         """
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.roberta(
             input_ids,
@@ -618,7 +619,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_tuple=return_tuple,
+            return_dict=return_dict,
         )
         sequence_output = outputs[0]
@@ -645,7 +646,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
-        if return_tuple:
+        if not return_dict:
             output = (start_logits, end_logits) + outputs[2:]
             return ((total_loss,) + output) if total_loss is not None else output
...
modeling_t5.py
@@ -675,7 +675,7 @@ class T5Stack(T5PreTrainedModel):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
     ):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -683,7 +683,7 @@ class T5Stack(T5PreTrainedModel):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -787,7 +787,7 @@ class T5Stack(T5PreTrainedModel):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
-        if return_tuple:
+        if not return_dict:
             return tuple(
                 v
                 for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
@@ -868,8 +868,9 @@ T5_INPUTS_DOCSTRING = r"""
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
         output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
-        return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -930,7 +931,7 @@ class T5Model(T5PreTrainedModel):
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
-        return_tuple=None,
+        return_dict=None,
         **kwargs,
     ):
         r"""
@@ -957,7 +958,7 @@ class T5Model(T5PreTrainedModel):
         assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # Encode if needed (training, first prediction pass)
         if encoder_outputs is None:
@@ -968,9 +969,9 @@ class T5Model(T5PreTrainedModel):
                 head_mask=head_mask,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
-                return_tuple=return_tuple,
+                return_dict=return_dict,
             )
-        elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput( encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0], last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
...@@ -1005,11 +1006,11 @@ class T5Model(T5PreTrainedModel): ...@@ -1005,11 +1006,11 @@ class T5Model(T5PreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
if return_tuple: if not return_dict:
if past is not None: if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs return decoder_outputs + encoder_outputs
...@@ -1081,7 +1082,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1081,7 +1082,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
**kwargs, **kwargs,
): ):
r""" r"""
...@@ -1100,13 +1101,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1100,13 +1101,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small') >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, labels=input_ids) >>> outputs = model(input_ids=input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2] >>> loss = outputs.loss
>>> logits = outputs.logits
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small') >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1 >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids) >>> outputs = model.generate(input_ids)
""" """
...@@ -1126,7 +1128,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1126,7 +1128,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
...@@ -1138,9 +1140,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1138,9 +1140,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput): elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput( encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0], last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
...@@ -1174,7 +1176,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1174,7 +1176,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = decoder_outputs[0] sequence_output = decoder_outputs[0]
...@@ -1190,7 +1192,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1190,7 +1192,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
if return_tuple: if not return_dict:
if past is not None: if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
......
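The `return_dict = return_dict if return_dict is not None else self.config.use_return_dict` line above means the per-call keyword argument always overrides the configuration default. A minimal sketch of that behaviour, assuming the t5-small checkpoint loaded with its default (tuple-returning) configuration:

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')  # config default: plain tuples

input_ids = tokenizer.encode("translate English to German: Hello", return_tensors="pt")

tuple_outputs = model(input_ids=input_ids, labels=input_ids)                   # plain tuple
dict_outputs = model(input_ids=input_ids, labels=input_ids, return_dict=True)  # ModelOutput

# Both calls run the same eval-mode forward pass, so the losses match.
assert torch.allclose(tuple_outputs[0], dict_outputs.loss)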
...@@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput): ...@@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput):
""" """
last_hidden_state: torch.FloatTensor last_hidden_state: torch.FloatTensor
mems: List[torch.FloatTensor] mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput): ...@@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
heads. heads.
""" """
losses: Optional[torch.FloatTensor] losses: Optional[torch.FloatTensor] = None
prediction_scores: torch.FloatTensor prediction_scores: torch.FloatTensor = None
mems: List[torch.FloatTensor] mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -695,8 +695,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" ...@@ -695,8 +695,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -836,13 +837,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -836,13 +837,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
...@@ -941,7 +942,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -941,7 +942,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
# We transpose back here to shape [bsz, len, hidden_dim] # We transpose back here to shape [bsz, len, hidden_dim]
core_out = core_out.transpose(0, 1).contiguous() core_out = core_out.transpose(0, 1).contiguous()
if return_tuple: if not return_dict:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TransfoXLModelOutput( return TransfoXLModelOutput(
...@@ -1013,7 +1014,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1013,7 +1014,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -1023,7 +1024,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1023,7 +1024,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None: if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1) bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None: elif inputs_embeds is not None:
...@@ -1038,7 +1039,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1038,7 +1039,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
...@@ -1048,7 +1049,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -1048,7 +1049,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else () prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None
if return_tuple: if not return_dict:
output = (prediction_scores,) + transformer_outputs[1:] output = (prediction_scores,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
......
...@@ -1167,7 +1167,7 @@ class SQuADHead(nn.Module): ...@@ -1167,7 +1167,7 @@ class SQuADHead(nn.Module):
cls_index: Optional[torch.LongTensor] = None, cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None, is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None, p_mask: Optional[torch.FloatTensor] = None,
return_tuple: bool = False, return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
""" """
Args: Args:
...@@ -1184,8 +1184,8 @@ class SQuADHead(nn.Module): ...@@ -1184,8 +1184,8 @@ class SQuADHead(nn.Module):
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`): p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
1.0 means token should be masked. 1.0 means token should be masked.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`): return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a plain tuple instead of a :class:`~transformers.file_utils.ModelOutput`. Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns: Returns:
""" """
...@@ -1214,7 +1214,7 @@ class SQuADHead(nn.Module): ...@@ -1214,7 +1214,7 @@ class SQuADHead(nn.Module):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5 total_loss += cls_loss * 0.5
return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss) return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
else: else:
# during inference, compute the end logits based on beam search # during inference, compute the end logits based on beam search
...@@ -1244,7 +1244,7 @@ class SQuADHead(nn.Module): ...@@ -1244,7 +1244,7 @@ class SQuADHead(nn.Module):
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
if return_tuple: if not return_dict:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
else: else:
return SquadHeadOutput( return SquadHeadOutput(
......
...@@ -367,8 +367,9 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -367,8 +367,9 @@ XLM_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -482,13 +483,13 @@ class XLMModel(XLMPreTrainedModel): ...@@ -482,13 +483,13 @@ class XLMModel(XLMPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None: if input_ids is not None:
bs, slen = input_ids.size() bs, slen = input_ids.size()
...@@ -595,7 +596,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -595,7 +596,7 @@ class XLMModel(XLMPreTrainedModel):
# move back sequence length to dimension 0 # move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
if return_tuple: if not return_dict:
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
...@@ -693,7 +694,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -693,7 +694,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -703,7 +704,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -703,7 +704,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -717,13 +718,13 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -717,13 +718,13 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided. outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
if return_tuple: if not return_dict:
return outputs + transformer_outputs[1:] return outputs + transformer_outputs[1:]
return MaskedLMOutput( return MaskedLMOutput(
...@@ -770,7 +771,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -770,7 +771,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -779,7 +780,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -779,7 +780,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -793,7 +794,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -793,7 +794,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -809,7 +810,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -809,7 +810,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + transformer_outputs[1:] output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -857,7 +858,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -857,7 +858,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -869,7 +870,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -869,7 +870,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss. Positions outside of the sequence are not taken into account for computing the loss.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -883,7 +884,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -883,7 +884,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = transformer_outputs[0] sequence_output = transformer_outputs[0]
...@@ -910,7 +911,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -910,7 +911,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions) end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
if return_tuple: if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:] output = (start_logits, end_logits) + transformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
...@@ -957,7 +958,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -957,7 +958,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
p_mask=None, p_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -984,7 +985,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -984,7 +985,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> import torch >>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
...@@ -993,7 +994,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -993,7 +994,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss >>> loss = outputs.loss
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -1007,7 +1008,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -1007,7 +1008,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1019,10 +1020,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -1019,10 +1020,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
cls_index=cls_index, cls_index=cls_index,
is_impossible=is_impossible, is_impossible=is_impossible,
p_mask=p_mask, p_mask=p_mask,
return_tuple=return_tuple, return_dict=return_dict,
) )
if return_tuple: if not return_dict:
return outputs + transformer_outputs[1:] return outputs + transformer_outputs[1:]
return XLMForQuestionAnsweringOutput( return XLMForQuestionAnsweringOutput(
...@@ -1074,14 +1075,14 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1074,14 +1075,14 @@ class XLMForTokenClassification(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -1095,7 +1096,7 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1095,7 +1096,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1117,7 +1118,7 @@ class XLMForTokenClassification(XLMPreTrainedModel): ...@@ -1117,7 +1118,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
else: else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + outputs[1:] output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -1162,7 +1163,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel): ...@@ -1162,7 +1163,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1170,7 +1171,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel): ...@@ -1170,7 +1171,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
...@@ -1204,7 +1205,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel): ...@@ -1204,7 +1205,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
...@@ -1216,7 +1217,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel): ...@@ -1216,7 +1217,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels) loss = loss_fct(reshaped_logits, labels)
if return_tuple: if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:] output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
......
...@@ -53,12 +53,6 @@ XLM_ROBERTA_START_DOCSTRING = r""" ...@@ -53,12 +53,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration. model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
""" """
......
...@@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput): ...@@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
heads. heads.
""" """
loss: Optional[torch.FloatTensor] loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput): ...@@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
heads. heads.
""" """
loss: Optional[torch.FloatTensor] loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput): ...@@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
heads. heads.
""" """
loss: Optional[torch.FloatTensor] loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput): ...@@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
heads. heads.
""" """
loss: Optional[torch.FloatTensor] loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): ...@@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
heads. heads.
""" """
loss: Optional[torch.FloatTensor] loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor end_logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None
...@@ -891,8 +891,9 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -891,8 +891,9 @@ XLNET_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -1051,13 +1052,13 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -1051,13 +1052,13 @@ class XLNetModel(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
...@@ -1239,7 +1240,7 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -1239,7 +1240,7 @@ class XLNetModel(XLNetPreTrainedModel):
else: else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
if return_tuple: if not return_dict:
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None) return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
return XLNetModelOutput( return XLNetModelOutput(
...@@ -1325,7 +1326,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1325,7 +1326,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
...@@ -1344,7 +1345,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1344,7 +1345,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
import torch import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context. # We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
...@@ -1369,7 +1370,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1369,7 +1370,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
loss = outputs.loss loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
...@@ -1385,7 +1386,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1385,7 +1386,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
logits = self.lm_loss(transformer_outputs[0]) logits = self.lm_loss(transformer_outputs[0])
...@@ -1396,7 +1397,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1396,7 +1397,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + transformer_outputs[1:] output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -1447,7 +1448,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1447,7 +1448,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`) labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`)
...@@ -1456,7 +1457,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1456,7 +1457,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
...@@ -1472,7 +1473,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1472,7 +1473,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1489,7 +1490,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1489,7 +1490,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + transformer_outputs[1:] output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -1539,7 +1540,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1539,7 +1540,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1547,7 +1548,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1547,7 +1548,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer( outputs = self.transformer(
...@@ -1563,7 +1564,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1563,7 +1564,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1584,7 +1585,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1584,7 +1585,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
else: else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + outputs[1:] output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -1634,7 +1635,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1634,7 +1635,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1642,7 +1643,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1642,7 +1643,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
...@@ -1669,7 +1670,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1669,7 +1670,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1683,7 +1684,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1683,7 +1684,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1)) loss = loss_fct(reshaped_logits, labels.view(-1))
if return_tuple: if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:] output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -1734,7 +1735,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1734,7 +1735,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1746,7 +1747,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1746,7 +1747,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss. Positions outside of the sequence are not taken into account for computing the loss.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer( outputs = self.transformer(
...@@ -1762,7 +1763,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1762,7 +1763,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1789,7 +1790,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1789,7 +1790,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions) end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
if return_tuple: if not return_dict:
output = (start_logits, end_logits) + outputs[1:] output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
...@@ -1842,7 +1843,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1842,7 +1843,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1869,7 +1870,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1869,7 +1870,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> import torch >>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
...@@ -1878,7 +1879,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1878,7 +1879,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> loss = outputs.loss >>> loss = outputs.loss
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
...@@ -1894,7 +1895,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1894,7 +1895,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
...@@ -1924,7 +1925,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1924,7 +1925,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5 total_loss += cls_loss * 0.5
if return_tuple: if not return_dict:
return (total_loss,) + transformer_outputs[1:] return (total_loss,) + transformer_outputs[1:]
else: else:
return XLNetForQuestionAnsweringOutput( return XLNetForQuestionAnsweringOutput(
...@@ -1966,7 +1967,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1966,7 +1967,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
hidden_states, start_states=start_states, cls_index=cls_index hidden_states, start_states=start_states, cls_index=cls_index
) # Shape (batch size,): one single `cls_logits` for each sample ) # Shape (batch size,): one single `cls_logits` for each sample
if return_tuple: if not return_dict:
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
return outputs + transformer_outputs[1:] return outputs + transformer_outputs[1:]
else: else:
......
...@@ -2122,6 +2122,6 @@ def pipeline( ...@@ -2122,6 +2122,6 @@ def pipeline(
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
"Trying to load the model with Tensorflow." "Trying to load the model with Tensorflow."
) )
model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs) model = model_class.from_pretrained(model, config=config, **model_kwargs)
return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs) return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
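With the forced return_tuple=True removed, pipelines simply rely on the model's default (tuple) output behaviour and keep working unchanged. A minimal usage sketch; the task name is an assumed example and the printed result is illustrative only:

from transformers import pipeline

nlp = pipeline("sentiment-analysis")
print(nlp("Switching from return_tuple to return_dict makes model outputs easier to read."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99}]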
...@@ -661,9 +661,7 @@ class Trainer: ...@@ -661,9 +661,7 @@ class Trainer:
if self.args.past_index >= 0 and self._past is not None: if self.args.past_index >= 0 and self._past is not None:
inputs["mems"] = self._past inputs["mems"] = self._past
# Our model outputs do not work with DataParallel, so forcing return tuple.
if isinstance(model, nn.DataParallel):
inputs["return_tuple"] = True
return inputs return inputs
def training_step( def training_step(
......
...@@ -260,8 +260,9 @@ XXX_INPUTS_DOCSTRING = r""" ...@@ -260,8 +260,9 @@ XXX_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -310,13 +311,13 @@ class XxxModel(XxxPreTrainedModel): ...@@ -310,13 +311,13 @@ class XxxModel(XxxPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
) )
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -351,7 +352,7 @@ class XxxModel(XxxPreTrainedModel): ...@@ -351,7 +352,7 @@ class XxxModel(XxxPreTrainedModel):
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
if return_tuple: if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:] return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling( return BaseModelOutputWithPooling(
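For readers adapting the template, the pattern every forward now follows is: resolve the flag from the config, keep the tuple path for backward compatibility, otherwise wrap the tensors in the matching output class. A condensed, self-contained toy showing that shape (an illustrative assumption, not the real template code):

import torch
from torch import nn
from transformers.modeling_outputs import BaseModelOutputWithPooling

class ToyModel(nn.Module):
    """Toy module mimicking the return_dict pattern of the template."""

    def __init__(self, hidden_size=8, use_return_dict=False):
        super().__init__()
        self.use_return_dict = use_return_dict      # stands in for config.use_return_dict
        self.encoder = nn.Linear(hidden_size, hidden_size)
        self.pooler = nn.Linear(hidden_size, hidden_size)

    def forward(self, inputs_embeds, return_dict=None):
        return_dict = return_dict if return_dict is not None else self.use_return_dict
        sequence_output = self.encoder(inputs_embeds)
        pooled_output = torch.tanh(self.pooler(sequence_output[:, 0]))
        if not return_dict:
            return (sequence_output, pooled_output)  # legacy tuple path
        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)

outputs = ToyModel()(torch.randn(2, 4, 8), return_dict=True)
print(outputs.pooler_output.shape)   # torch.Size([2, 8])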
...@@ -393,7 +394,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -393,7 +394,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -402,7 +403,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -402,7 +403,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -413,7 +414,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -413,7 +414,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -424,7 +425,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): ...@@ -424,7 +425,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss() # -100 index = padding token loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if return_tuple: if not return_dict:
output = (prediction_scores,) + outputs[2:] output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
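On the consuming side, the head models expose their loss and logits as named fields once return_dict is on. A quick sketch using BERT's masked-LM head as a stand-in for the template class (the loss/logits field names are assumed from the new output classes; double-check them for other heads):

import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)

print(outputs.loss)           # masked-LM loss (assumed field name: loss)
print(outputs.logits.shape)   # (batch_size, sequence_length, vocab_size)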
...@@ -470,7 +471,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -470,7 +471,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -479,7 +480,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -479,7 +480,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -490,7 +491,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -490,7 +491,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
...@@ -508,7 +509,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): ...@@ -508,7 +509,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + outputs[2:] output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -550,7 +551,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -550,7 +551,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -558,7 +559,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -558,7 +559,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
...@@ -580,7 +581,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -580,7 +581,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
...@@ -594,7 +595,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel): ...@@ -594,7 +595,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels) loss = loss_fct(reshaped_logits, labels)
if return_tuple: if not return_dict:
output = (reshaped_logits,) + outputs[2:] output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
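Because the multiple-choice head flattens (batch, num_choices, seq_len) inputs and then views the logits back to (batch, num_choices), a small end-to-end sketch makes the expected shapes concrete (BERT and the public checkpoint are illustrative stand-ins; the choice head is randomly initialized here):

import torch
from transformers import BertForMultipleChoice, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMultipleChoice.from_pretrained("bert-base-uncased")

prompt = "The sky is"
choices = ["blue.", "made of cheese."]
encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
# Add the batch dimension: every tensor becomes (batch=1, num_choices=2, seq_len).
inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}

with torch.no_grad():
    outputs = model(**inputs, return_dict=True)
print(outputs.logits.shape)   # torch.Size([1, 2]): one score per choice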
...@@ -637,14 +638,14 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -637,14 +638,14 @@ class XxxForTokenClassification(XxxPreTrainedModel):
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -655,7 +656,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -655,7 +656,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -677,7 +678,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): ...@@ -677,7 +678,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
else: else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple: if not return_dict:
output = (logits,) + outputs[2:] output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
...@@ -720,7 +721,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -720,7 +721,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_tuple=None, return_dict=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -732,7 +733,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -732,7 +733,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss. Positions outside of the sequence are not taken into account for computing the loss.
""" """
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer( outputs = self.transformer(
input_ids, input_ids,
...@@ -743,7 +744,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -743,7 +744,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_tuple=return_tuple, return_dict=return_dict,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -770,7 +771,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): ...@@ -770,7 +771,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions) end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
if return_tuple: if not return_dict:
output = (start_logits, end_logits) + outputs[2:] output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output return ((total_loss,) + output) if total_loss is not None else output
......
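The question-answering head follows the same pattern but returns two logit tensors. A sketch of attribute access on the dict-style output (the checkpoint is illustrative and its QA head is randomly initialized, so the predicted span is meaningless here):

import torch
from transformers import BertForQuestionAnswering, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

inputs = tokenizer("Who wrote the book?", "The book was written by Jane.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, return_dict=True)

start_index = torch.argmax(outputs.start_logits, dim=-1)
end_index = torch.argmax(outputs.end_logits, dim=-1)
print(start_index.item(), end_index.item())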
...@@ -74,6 +74,7 @@ class ModelTesterMixin: ...@@ -74,6 +74,7 @@ class ModelTesterMixin:
def test_save_load(self): def test_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes: for model_class in self.all_model_classes:
model = model_class(config) model = model_class(config)
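Setting config.return_dict = True in the common tests makes the dict-style outputs the thing under test; ModelOutput.to_tuple() then gives a uniform way to compare them element by element. A self-contained sketch of that save/reload round trip (a tiny randomly initialized BERT stands in for the tested model classes):

import tempfile
import torch
from transformers import BertConfig, BertModel

config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
config.return_dict = True
model = BertModel(config)
model.eval()

inputs = {"input_ids": torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])}
with torch.no_grad():
    first = model(**inputs)

with tempfile.TemporaryDirectory() as tmpdirname:
    model.save_pretrained(tmpdirname)
    reloaded = BertModel.from_pretrained(tmpdirname)
    with torch.no_grad():
        second = reloaded(**inputs)

# to_tuple() drops the None fields, so both outputs line up tensor by tensor.
for t1, t2 in zip(first.to_tuple(), second.to_tuple()):
    assert torch.allclose(t1, t2, atol=1e-5)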
...@@ -803,8 +804,6 @@ class ModelTesterMixin: ...@@ -803,8 +804,6 @@ class ModelTesterMixin:
# Wrap model in nn.DataParallel # Wrap model in nn.DataParallel
model = torch.nn.DataParallel(model) model = torch.nn.DataParallel(model)
# Our model outputs do not work with DataParallel, so forcing return tuple.
inputs_dict["return_tuple"] = True
with torch.no_grad(): with torch.no_grad():
_ = model(**self._prepare_for_class(inputs_dict, model_class)) _ = model(**self._prepare_for_class(inputs_dict, model_class))
......
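The forced tuple in the DataParallel test above could be dropped presumably because ModelOutput now behaves both as a mapping and as a tuple, which is what generic scatter/gather code relies on. A tiny demonstration of that dual behaviour (SequenceClassifierOutput is just one convenient output class to poke at):

import torch
from transformers.modeling_outputs import SequenceClassifierOutput

out = SequenceClassifierOutput(logits=torch.ones(2, 3))
print(list(out.keys()))                        # ['logits'] -- fields left as None are dropped
print(out["logits"] is out.logits is out[0])   # True: mapping, attribute and positional access agree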
...@@ -329,7 +329,6 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase): ...@@ -329,7 +329,6 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
import tempfile import tempfile
config_and_inputs = self.model_tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
config_and_inputs[0].return_tuple = True
model = T5Model(config_and_inputs[0]).to(torch_device) model = T5Model(config_and_inputs[0]).to(torch_device)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
torch.onnx.export( torch.onnx.export(
......
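The explicit return_tuple flag in the ONNX test above becomes unnecessary assuming the new config flag defaults to False in this release, i.e. models keep returning plain tuples unless asked otherwise, which is the shape torch.onnx.export expects. A quick check of that assumption:

from transformers import T5Config

config = T5Config()
print(config.return_dict)       # expected: False (tuple outputs remain the default here)
print(config.use_return_dict)   # expected: False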