Unverified Commit 91cb9546 authored by Sylvain Gugger, committed by GitHub

Switch from return_tuple to return_dict (#6138)



* Switch from return_tuple to return_dict

* Fix test

* [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)

* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests

* AutoModels


Tiny tweaks

* Style

* Final changes before merge

* Re-order for simpler review

* Final fixes

* Addressing @sgugger's comments

* Test MultipleChoice

* Rework TF trainer (#6038)

* Fully rework training/prediction loops

* fix method name

* Fix variable name

* Fix property name

* Fix scope

* Fix method name

* Fix tuple index

* Fix tuple index

* Fix indentation

* Fix variable name

* fix eval before log

* Add drop remainder for test dataset

* Fix step number + fix logging datetime

* fix eval loss value

* use global step instead of step + fix logging at step 0

* Fix logging datetime

* Fix global_step usage

* Fix breaking loop + logging datetime

* Fix step in prediction loop

* Fix step breaking

* Fix train/test loops

* Force TF at least 2.2 for the trainer

* Use assert_cardinality to facilitate the dataset size computation

* Log steps per epoch

* Make tfds compliant with TPU

* Make tfds compliant with TPU

* Use TF dataset enumerate instead of the Python one

* revert previous commit

* Fix data_dir

* Apply style

* rebase on master

* Address Sylvain's comments

* Address Sylvain's and Lysandre's comments

* Trigger CI

* Remove unused import

* Switch from return_tuple to return_dict

* Fix test

* Add recent model
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Julien Plu <plu.julien@gmail.com>
parent 562b6369
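
To make the scope of the rename concrete, here is a minimal before/after sketch of what the switch means for callers. It is illustrative only and not part of the diff; it reuses the 'openai-gpt' checkpoint, `OpenAIGPTLMHeadModel`, and the access patterns that appear in the updated docstrings below, and the input sentence is just a placeholder.

# Minimal sketch of the new return_dict API (illustrative; checkpoint and model
# class are the ones that appear in the diff below).
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt', return_dict=True)
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # batch size 1

# With return_dict=True the forward pass returns a ModelOutput, so results are
# read by name rather than by tuple position.
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs.loss, outputs.logits

# Passing return_dict=False keeps the old plain-tuple behaviour:
# (loss, lm_logits, ...) when labels are provided.
loss, logits = model(input_ids, labels=input_ids, return_dict=False)[:2]
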
......@@ -315,10 +315,10 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput):
heads.
"""
lm_loss: Optional[torch.FloatTensor]
mc_loss: Optional[torch.FloatTensor]
lm_logits: torch.FloatTensor
mc_logits: torch.FloatTensor
lm_loss: Optional[torch.FloatTensor] = None
mc_loss: Optional[torch.FloatTensor] = None
lm_logits: torch.FloatTensor = None
mc_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -374,8 +374,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
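The parameter documented above is resolved per call: an explicit `return_dict` argument takes precedence, and when it is left as ``None`` the model falls back to ``self.config.use_return_dict``, as the forward bodies below show. A small hedged sketch of that precedence (not part of the diff; the token ids are arbitrary placeholders):

# Precedence between the forward argument and the config default.
import torch
from transformers import OpenAIGPTModel

model = OpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = torch.tensor([[40, 87, 11]])         # placeholder token ids, batch size 1

dict_out = model(input_ids, return_dict=True)    # per-call override -> BaseModelOutput
tuple_out = model(input_ids, return_dict=False)  # per-call override -> plain tuple
default_out = model(input_ids)                   # None -> falls back to config.use_return_dict
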
......@@ -425,13 +426,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -496,7 +497,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if return_tuple:
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return BaseModelOutput(
......@@ -538,7 +539,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -548,7 +549,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
......@@ -559,7 +560,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)
......@@ -573,7 +574,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if return_tuple:
if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -622,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
mc_labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
**kwargs
):
r"""
......@@ -650,7 +651,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True)
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!)
model.resize_token_embeddings(len(tokenizer))
......@@ -662,7 +663,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
lm_logits = outputs.lm_logits
mc_logits = outputs.mc_logits
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if "lm_labels" in kwargs:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
......@@ -680,7 +681,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
......@@ -698,7 +699,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
loss_fct = CrossEntropyLoss()
mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
if return_tuple:
if not return_dict:
output = (lm_logits, mc_logits) + transformer_outputs[1:]
if mc_loss is not None:
output = (mc_loss,) + output
......
......@@ -63,7 +63,7 @@ class BaseModelOutputWithPooling(ModelOutput):
"""
last_hidden_state: torch.FloatTensor
pooler_output: torch.FloatTensor
pooler_output: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -179,7 +179,7 @@ class CausalLMOutput(ModelOutput):
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -213,8 +213,8 @@ class CausalLMOutputWithPast(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -243,8 +243,8 @@ class MaskedLMOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -291,8 +291,8 @@ class Seq2SeqLMOutput(ModelOutput):
self-attention heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -324,8 +324,8 @@ class NextSentencePredictorOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -353,8 +353,8 @@ class SequenceClassifierOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
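
Giving every field of these output dataclasses a default of ``None`` is what lets a model populate only the values it actually computed (for instance, no ``loss`` when no labels are passed), and it also satisfies Python's rule that dataclass fields following a defaulted field must themselves have defaults. A hedged sketch of the pattern, using the SequenceClassifierOutput defined above:

import torch
from transformers.modeling_outputs import SequenceClassifierOutput

# At inference time no loss is computed, so only logits is passed; the other
# fields fall back to their new None defaults.
logits = torch.randn(2, 3)                       # placeholder: batch of 2, 3 labels
out = SequenceClassifierOutput(logits=logits)    # valid now that loss defaults to None
assert out.loss is None and out.logits is logits
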
......@@ -401,8 +401,8 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput):
self-attention heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -436,8 +436,8 @@ class MultipleChoiceModelOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -465,8 +465,8 @@ class TokenClassifierOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -496,9 +496,9 @@ class QuestionAnsweringModelOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
start_logits: torch.FloatTensor
end_logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -547,9 +547,9 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
self-attention heads.
"""
loss: Optional[torch.FloatTensor]
start_logits: torch.FloatTensor
end_logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
decoder_past_key_values: Optional[List[torch.FloatTensor]] = None
decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
......
......@@ -39,13 +39,7 @@ from .file_utils import (
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
MaskedLMOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
)
from .modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
......@@ -1851,8 +1845,8 @@ class ReformerModelWithLMHeadOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_buckets_states: Optional[List[Tuple[torch.LongTensor, torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -1922,8 +1916,9 @@ REFORMER_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -1962,7 +1957,7 @@ class ReformerModel(ReformerPreTrainedModel):
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/reformer-crime-and-punishment",
output_type=BaseModelOutput,
output_type=ReformerModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
......@@ -1977,40 +1972,14 @@ class ReformerModel(ReformerPreTrainedModel):
use_cache=None,
output_hidden_states=None,
output_attentions=None,
return_tuple=None,
return_dict=None,
):
r"""
Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with :obj:`tuple(0)` being the previous `buckets` of shape
:obj:`(batch_size, num_heads, num_hashes, sequence_length)`)
and :obj:`tuple(1)` being the previous `hidden_states` of shape
:obj:`(batch_size, sequence_length, hidden_size)`).
Contains pre-computed buckets and hidden-states that can be used (see
``past_buckets_states`` input) to speed up sequential decoding.
all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -2102,7 +2071,7 @@ class ReformerModel(ReformerPreTrainedModel):
hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None
attentions = encoder_outputs.all_attentions if output_attentions else None
if return_tuple:
if not return_dict:
return tuple(v for v in [sequence_output, past_buckets_states, hidden_states, attentions] if v is not None)
return ReformerModelOutput(
last_hidden_state=sequence_output,
......@@ -2208,7 +2177,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
use_cache=None,
output_hidden_states=None,
output_attentions=None,
return_tuple=None,
return_dict=None,
labels=None,
):
r"""
......@@ -2218,7 +2187,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
......@@ -2231,7 +2200,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
use_cache=use_cache,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
......@@ -2246,7 +2215,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + reformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -2326,7 +2295,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
labels=None,
output_hidden_states=None,
output_attentions=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -2334,7 +2303,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
......@@ -2346,7 +2315,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
use_cache=False, # no causal mask
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
......@@ -2357,7 +2326,7 @@ class ReformerForMaskedLM(ReformerPreTrainedModel):
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + reformer_outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
......@@ -2408,7 +2377,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
labels=None,
output_hidden_states=None,
output_attentions=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -2427,7 +2396,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
num_hashes=num_hashes,
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -2443,7 +2412,7 @@ class ReformerForSequenceClassification(ReformerPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -2511,7 +2480,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
end_positions=None,
output_hidden_states=None,
output_attentions=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -2523,7 +2492,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
reformer_outputs = self.reformer(
input_ids,
......@@ -2535,7 +2504,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
use_cache=False, # no causal mask
output_hidden_states=output_hidden_states,
output_attentions=output_attentions,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = reformer_outputs[0]
......@@ -2562,7 +2531,7 @@ class ReformerForQuestionAnswering(ReformerPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if return_tuple:
if not return_dict:
output = (start_logits, end_logits) + reformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
......
......@@ -143,8 +143,9 @@ ROBERTA_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -208,7 +209,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
**kwargs
):
r"""
......@@ -227,7 +228,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
......@@ -238,7 +239,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
......@@ -248,7 +249,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if return_tuple:
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
......@@ -321,7 +322,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -330,7 +331,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
......@@ -341,7 +342,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
......@@ -356,7 +357,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -401,7 +402,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -409,7 +410,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
......@@ -431,7 +432,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
pooled_output = outputs[1]
......@@ -444,7 +445,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if return_tuple:
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -490,14 +491,14 @@ class RobertaForTokenClassification(BertPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
......@@ -508,7 +509,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -530,7 +531,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -595,7 +596,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
end_positions=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -607,7 +608,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.roberta(
input_ids,
......@@ -618,7 +619,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -645,7 +646,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if return_tuple:
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
......
......@@ -675,7 +675,7 @@ class T5Stack(T5PreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
use_cache = use_cache if use_cache is not None else self.config.use_cache
......@@ -683,7 +683,7 @@ class T5Stack(T5PreTrainedModel):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -787,7 +787,7 @@ class T5Stack(T5PreTrainedModel):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if return_tuple:
if not return_dict:
return tuple(
v
for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions]
......@@ -868,8 +868,9 @@ T5_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -930,7 +931,7 @@ class T5Model(T5PreTrainedModel):
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
**kwargs,
):
r"""
......@@ -957,7 +958,7 @@ class T5Model(T5PreTrainedModel):
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
......@@ -968,9 +969,9 @@ class T5Model(T5PreTrainedModel):
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
......@@ -1005,11 +1006,11 @@ class T5Model(T5PreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
if return_tuple:
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs
......@@ -1081,7 +1082,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
**kwargs,
):
r"""
......@@ -1100,13 +1101,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
>>> loss = outputs.loss
>>> logits = outputs.logits
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids)
"""
......@@ -1126,7 +1128,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
......@@ -1138,9 +1140,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput):
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
......@@ -1174,7 +1176,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = decoder_outputs[0]
......@@ -1190,7 +1192,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None
if return_tuple:
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
......
......@@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput):
"""
last_hidden_state: torch.FloatTensor
mems: List[torch.FloatTensor]
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput):
heads.
"""
losses: Optional[torch.FloatTensor]
prediction_scores: torch.FloatTensor
mems: List[torch.FloatTensor]
losses: Optional[torch.FloatTensor] = None
prediction_scores: torch.FloatTensor = None
mems: List[torch.FloatTensor] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -695,8 +695,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -836,13 +837,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
......@@ -941,7 +942,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
# We transpose back here to shape [bsz, len, hidden_dim]
core_out = core_out.transpose(0, 1).contiguous()
if return_tuple:
if not return_dict:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TransfoXLModelOutput(
......@@ -1013,7 +1014,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -1023,7 +1024,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
elif inputs_embeds is not None:
......@@ -1038,7 +1039,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
last_hidden = transformer_outputs[0]
......@@ -1048,7 +1049,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None
if return_tuple:
if not return_dict:
output = (prediction_scores,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......
......@@ -1167,7 +1167,7 @@ class SQuADHead(nn.Module):
cls_index: Optional[torch.LongTensor] = None,
is_impossible: Optional[torch.LongTensor] = None,
p_mask: Optional[torch.FloatTensor] = None,
return_tuple: bool = False,
return_dict: bool = False,
) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
"""
Args:
......@@ -1184,8 +1184,8 @@ class SQuADHead(nn.Module):
p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS).
1.0 means token should be masked.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a plain tuple instead of a :class:`~transformers.file_utils.ModelOuput`.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
Returns:
"""
......@@ -1214,7 +1214,7 @@ class SQuADHead(nn.Module):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss)
return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
else:
# during inference, compute the end logits based on beam search
......@@ -1244,7 +1244,7 @@ class SQuADHead(nn.Module):
start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
if return_tuple:
if not return_dict:
return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
else:
return SquadHeadOutput(
......
......@@ -367,8 +367,9 @@ XLM_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -482,13 +483,13 @@ class XLMModel(XLMPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None:
bs, slen = input_ids.size()
......@@ -595,7 +596,7 @@ class XLMModel(XLMPreTrainedModel):
# move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1)
if return_tuple:
if not return_dict:
return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
......@@ -693,7 +694,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -703,7 +704,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
......@@ -717,13 +718,13 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided.
if return_tuple:
if not return_dict:
return outputs + transformer_outputs[1:]
return MaskedLMOutput(
......@@ -770,7 +771,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -779,7 +780,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
......@@ -793,7 +794,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
......@@ -809,7 +810,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -857,7 +858,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_positions=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -869,7 +870,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
......@@ -883,7 +884,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = transformer_outputs[0]
......@@ -910,7 +911,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if return_tuple:
if not return_dict:
output = (start_logits, end_logits) + transformer_outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
......@@ -957,7 +958,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
p_mask=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -984,7 +985,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
......@@ -993,7 +994,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.transformer(
input_ids,
......@@ -1007,7 +1008,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
......@@ -1019,10 +1020,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
cls_index=cls_index,
is_impossible=is_impossible,
p_mask=p_mask,
return_tuple=return_tuple,
return_dict=return_dict,
)
if return_tuple:
if not return_dict:
return outputs + transformer_outputs[1:]
return XLMForQuestionAnsweringOutput(
......@@ -1074,14 +1075,14 @@ class XLMForTokenClassification(XLMPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
......@@ -1095,7 +1096,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -1117,7 +1118,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -1162,7 +1163,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -1170,7 +1171,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
......@@ -1204,7 +1205,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
......@@ -1216,7 +1217,7 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if return_tuple:
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......
......@@ -53,12 +53,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
"""
......
......@@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
heads.
"""
loss: Optional[torch.FloatTensor]
start_logits: torch.FloatTensor
end_logits: torch.FloatTensor
loss: Optional[torch.FloatTensor] = None
start_logits: torch.FloatTensor = None
end_logits: torch.FloatTensor = None
mems: Optional[List[torch.FloatTensor]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
......@@ -891,8 +891,9 @@ XLNET_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -1051,13 +1052,13 @@ class XLNetModel(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
......@@ -1239,7 +1240,7 @@ class XLNetModel(XLNetPreTrainedModel):
else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
if return_tuple:
if not return_dict:
return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
return XLNetModelOutput(
......@@ -1325,7 +1326,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
......@@ -1344,7 +1345,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
import torch
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True)
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
......@@ -1369,7 +1370,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
loss = outputs.loss
next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
......@@ -1385,7 +1386,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
logits = self.lm_loss(transformer_outputs[0])
......@@ -1396,7 +1397,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -1447,7 +1448,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -1456,7 +1457,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
......@@ -1472,7 +1473,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
......@@ -1489,7 +1490,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -1539,7 +1540,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -1547,7 +1548,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
Indices should be in ``[0, ..., config.num_labels - 1]`` (one label per token in the sequence).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer(
......@@ -1563,7 +1564,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -1584,7 +1585,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -1634,7 +1635,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -1642,7 +1643,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
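The multiple-choice head flattens the choice dimension before running the transformer and restores it afterwards; a standalone sketch of that reshape (shapes assumed from the docstring, not lines from this diff):

import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 32000, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))    # [batch_size * num_choices, seq_len]
# ... the transformer scores each flattened choice ...
logits = torch.randn(batch_size * num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)             # [batch_size, num_choices], fed to CrossEntropyLoss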
......@@ -1669,7 +1670,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
output = transformer_outputs[0]
......@@ -1683,7 +1684,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels.view(-1))
if return_tuple:
if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
......@@ -1734,7 +1735,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -1746,7 +1747,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
outputs = self.transformer(
......@@ -1762,7 +1763,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -1789,7 +1790,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if return_tuple:
if not return_dict:
output = (start_logits, end_logits) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
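For context, the start/end losses averaged in the hunk above are usually computed with clamped positions and an ignored index, roughly as follows (a sketch, not part of the diff):

import torch
from torch.nn import CrossEntropyLoss

start_logits = torch.randn(2, 16)                 # [batch_size, seq_len]
end_logits = torch.randn(2, 16)
start_positions = torch.tensor([3, 40])           # the second position is deliberately out of range
end_positions = torch.tensor([5, 41])

ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)   # clamp to the sequence length
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)     # clamped-out positions contribute nothing
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2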
......@@ -1842,7 +1843,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -1869,7 +1870,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> start_positions = torch.tensor([1])
......@@ -1878,7 +1879,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> loss = outputs.loss
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
transformer_outputs = self.transformer(
......@@ -1894,7 +1895,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......@@ -1924,7 +1925,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
# note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
total_loss += cls_loss * 0.5
if return_tuple:
if not return_dict:
return (total_loss,) + transformer_outputs[1:]
else:
return XLNetForQuestionAnsweringOutput(
......@@ -1966,7 +1967,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
hidden_states, start_states=start_states, cls_index=cls_index
) # Shape (batch size,): one single `cls_logits` for each sample
if return_tuple:
if not return_dict:
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
return outputs + transformer_outputs[1:]
else:
......
......@@ -2122,6 +2122,6 @@ def pipeline(
"Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
"Trying to load the model with Tensorflow."
)
model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs)
model = model_class.from_pretrained(model, config=config, **model_kwargs)
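Since ``config.use_return_dict`` defaults to ``False``, pipelines keep receiving plain tuples without the old override; a quick smoke test of that path, assuming the default checkpoint can be downloaded (illustrative only):

from transformers import pipeline

nlp = pipeline("sentiment-analysis")
print(nlp("Switching from return_tuple to return_dict went smoothly."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]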
return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs)
......@@ -661,9 +661,7 @@ class Trainer:
if self.args.past_index >= 0 and self._past is not None:
inputs["mems"] = self._past
# Our model outputs do not work with DataParallel, so forcing return tuple.
if isinstance(model, nn.DataParallel):
inputs["return_tuple"] = True
return inputs
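With the override removed, a DataParallel-wrapped model simply returns whatever the config dictates (a plain tuple by default); a minimal sketch of the call path, where ``model`` and ``inputs`` are assumed to come from the surrounding Trainer code:

import torch
import torch.nn as nn

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)   # replicate across visible GPUs
outputs = model(**inputs)            # with return_dict unset this is a plain tuple
loss = outputs[0]                    # first element is the loss when labels are passed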
def training_step(
......
......@@ -260,8 +260,9 @@ XXX_INPUTS_DOCSTRING = r"""
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
"""
......@@ -310,13 +311,13 @@ class XxxModel(XxxPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
......@@ -351,7 +352,7 @@ class XxxModel(XxxPreTrainedModel):
sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
if return_tuple:
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPooling(
......@@ -393,7 +394,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
......@@ -402,7 +403,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
......@@ -413,7 +414,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -424,7 +425,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if return_tuple:
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
......@@ -470,7 +471,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -479,7 +480,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
......@@ -490,7 +491,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
pooled_output = outputs[1]
......@@ -508,7 +509,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -550,7 +551,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -558,7 +559,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
......@@ -580,7 +581,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
pooled_output = outputs[1]
......@@ -594,7 +595,7 @@ class XxxForMultipleChoice(XxxPreTrainedModel):
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
if return_tuple:
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -637,14 +638,14 @@ class XxxForTokenClassification(XxxPreTrainedModel):
labels=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
......@@ -655,7 +656,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -677,7 +678,7 @@ class XxxForTokenClassification(XxxPreTrainedModel):
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if return_tuple:
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
......@@ -720,7 +721,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
end_positions=None,
output_attentions=None,
output_hidden_states=None,
return_tuple=None,
return_dict=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
......@@ -732,7 +733,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.transformer(
input_ids,
......@@ -743,7 +744,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_tuple=return_tuple,
return_dict=return_dict,
)
sequence_output = outputs[0]
......@@ -770,7 +771,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel):
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if return_tuple:
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
......
......@@ -74,6 +74,7 @@ class ModelTesterMixin:
def test_save_load(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.return_dict = True
for model_class in self.all_model_classes:
model = model_class(config)
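With ``config.return_dict = True`` set above, the round trip inside this test can compare outputs by position or by name; a sketch of that kind of check, with variable names assumed from the surrounding test harness (illustrative only):

import tempfile
import torch

model.eval()
with torch.no_grad():
    outputs_before = model(**inputs_dict)

with tempfile.TemporaryDirectory() as tmpdirname:
    model.save_pretrained(tmpdirname)
    reloaded = model_class.from_pretrained(tmpdirname)
    reloaded.eval()
    with torch.no_grad():
        outputs_after = reloaded(**inputs_dict)

torch.testing.assert_allclose(outputs_before[0], outputs_after[0])   # same first output after save/load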
......@@ -803,8 +804,6 @@ class ModelTesterMixin:
# Wrap model in nn.DataParallel
model = torch.nn.DataParallel(model)
# Our model outputs do not work with DataParallel, so forcing return tuple.
inputs_dict["return_tuple"] = True
with torch.no_grad():
_ = model(**self._prepare_for_class(inputs_dict, model_class))
......
......@@ -329,7 +329,6 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
import tempfile
config_and_inputs = self.model_tester.prepare_config_and_inputs()
config_and_inputs[0].return_tuple = True
model = T5Model(config_and_inputs[0]).to(torch_device)
with tempfile.TemporaryDirectory() as tmpdirname:
torch.onnx.export(
......