"vscode:/vscode.git/clone" did not exist on "d8bc1a4e5124cd70fede43ad6ce4aa4095f07de3"
Unverified commit c67d1a02, authored by Sylvain Gugger, committed by GitHub

Tf model outputs (#6247)

* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten

* Add new models and fix issues

* Quality improvements

* Add T5

* A bit of cleanup

* Fix for slow tests

* Style
parent bd0eab35
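
The change that threads through every file in this diff is a new `return_dict` argument (defaulting to `config.use_return_dict`) that lets the TF models return a typed `ModelOutput` instead of a positional tuple. A minimal usage sketch, assuming a transformers build that includes this commit plus TensorFlow, with an illustrative BERT checkpoint (BERT being the first model the PR covers):

    from transformers import BertTokenizer, TFBertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = TFBertModel.from_pretrained("bert-base-uncased")
    inputs = tokenizer("Hello world", return_tensors="tf")

    # Default behavior is unchanged: a plain tuple, indexed positionally.
    outputs = model(inputs)
    last_hidden_state = outputs[0]

    # With return_dict=True, the model returns a ModelOutput with named fields.
    outputs = model(inputs, return_dict=True)
    last_hidden_state = outputs.last_hidden_state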
@@ -22,6 +22,7 @@ import tensorflow as tf
 from .configuration_flaubert import FlaubertConfig
 from .file_utils import add_start_docstrings
+from .modeling_tf_outputs import TFBaseModelOutput
 from .modeling_tf_utils import keras_serializable, shape_list
 from .modeling_tf_xlm import (
     TFXLMForMultipleChoice,
@@ -103,6 +104,11 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -126,6 +132,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         self.pre_norm = getattr(config, "pre_norm", False)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict

     def call(
         self,
@@ -140,6 +147,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         # removed: src_enc=None, src_len=None
@@ -155,7 +163,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
             output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
             output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
-            assert len(inputs) <= 11, "Too many inputs."
+            return_dict = inputs[11] if len(inputs) > 11 else return_dict
+            assert len(inputs) <= 12, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -168,12 +177,14 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             output_attentions = inputs.get("output_attentions", output_attentions)
             output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
-            assert len(inputs) <= 11, "Too many inputs."
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 12, "Too many inputs."
         else:
             input_ids = inputs

         output_attentions = output_attentions if output_attentions is not None else self.output_attentions
         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -260,8 +271,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         tensor = tensor * mask[..., tf.newaxis]

         # transformer layers
-        hidden_states = ()
-        attentions = ()
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
         for i in range(self.n_layers):
             # LayerDrop
             dropout_probability = random.uniform(0, 1)
@@ -321,12 +332,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         # move back sequence length to dimension 0
         # tensor = tensor.transpose(0, 1)

-        outputs = (tensor,)
-        if output_hidden_states:
-            outputs = outputs + (hidden_states,)
-        if output_attentions:
-            outputs = outputs + (attentions,)
-        return outputs  # outputs, (hidden_states), (attentions)
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+        return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)


 @add_start_docstrings(
......
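
The return pattern above recurs in every model this PR touches: optional members stay `None` unless requested, and the legacy tuple is rebuilt by filtering out the `None` values, so positional indexing keeps working for existing callers. A standalone sketch of the idiom, with a plain dict standing in for `TFBaseModelOutput`:

    def build_output(tensor, hidden_states=None, attentions=None, return_dict=False):
        # Mirrors the return block above: members not requested stay None.
        if not return_dict:
            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
        return {"last_hidden_state": tensor, "hidden_states": hidden_states, "attentions": attentions}

    assert build_output("h") == ("h",)
    # When only attentions are requested they move up to position 1,
    # matching the pre-existing tuple behavior.
    assert build_output("h", attentions=("a",)) == ("h", ("a",))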
@@ -17,12 +17,21 @@
 import logging
+from dataclasses import dataclass
+from typing import Optional, Tuple

 import numpy as np
 import tensorflow as tf

 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+    replace_return_docstrings,
+)
+from .modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
 from .modeling_tf_utils import (
     TFCausalLanguageModelingLoss,
     TFConv1D,
@@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding
 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "OpenAIGPTConfig"
 _TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"

 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -208,6 +218,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         super().__init__(*inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
+        self.return_dict = config.use_return_dict
         self.num_hidden_layers = config.n_layer
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
@@ -247,6 +258,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         if isinstance(inputs, (tuple, list)):
@@ -258,7 +270,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
             output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
-            assert len(inputs) <= 8, "Too many inputs."
+            return_dict = inputs[8] if len(inputs) > 8 else return_dict
+            assert len(inputs) <= 9, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -268,12 +281,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             output_attentions = inputs.get("output_attentions", output_attentions)
             output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
-            assert len(inputs) <= 8, "Too many inputs."
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 9, "Too many inputs."
         else:
             input_ids = inputs

         output_attentions = output_attentions if output_attentions is not None else self.output_attentions
         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -333,8 +348,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
         output_shape = input_shape + [shape_list(hidden_states)[-1]]

-        all_attentions = []
-        all_hidden_states = ()
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
         for i, block in enumerate(self.h):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
@@ -342,22 +357,24 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
             outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training)
             hidden_states = outputs[0]
             if output_attentions:
-                all_attentions.append(outputs[1])
+                all_attentions = all_attentions + (outputs[1],)

         hidden_states = tf.reshape(hidden_states, output_shape)
         # Add last hidden state
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states,)
-        if output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
         if output_attentions:
             # let the number of heads free (-1) so we can extract attention even after head pruning
             attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
             all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, (all hidden_states), (attentions)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
+        )


 class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
@@ -369,6 +386,35 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
     base_model_prefix = "transformer"


+@dataclass
+class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
+    """
+    Base class for outputs of models predicting if two sentences are consecutive or not.
+
+    Args:
+        lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
+            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    lm_logits: tf.Tensor = None
+    mc_logits: tf.Tensor = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[tf.Tensor]] = None
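
A hedged sketch of how this dataclass behaves once instantiated: `ModelOutput` subclasses support both attribute access and dict-style key access, and fields left at `None` are dropped from the tuple view. The shapes below are illustrative only, and the import path assumes the module layout of this commit:

    import tensorflow as tf

    from transformers.modeling_tf_openai import TFOpenAIGPTDoubleHeadsModelOutput

    out = TFOpenAIGPTDoubleHeadsModelOutput(
        lm_logits=tf.zeros((1, 2, 5, 100)),  # (batch, num_choices, seq_len, vocab)
        mc_logits=tf.zeros((1, 2)),          # (batch, num_choices)
    )
    print(out.lm_logits.shape)     # attribute access
    print(out["mc_logits"].shape)  # dict-style access by key
    print(len(out.to_tuple()))     # 2 -- hidden_states and attentions are None and dropped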
 OPENAI_GPT_START_DOCSTRING = r"""
     .. note::
@@ -436,6 +482,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
             (if set to :obj:`False`) for evaluation.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -449,25 +500,13 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="openai-gpt",
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(self, inputs, **kwargs):
-        r"""
-        Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the last layer of the model.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -486,7 +525,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
         return self.transformer.tokens_embed

     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="openai-gpt",
+        output_type=TFCausalLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(
         self,
         inputs,
@@ -497,6 +541,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         labels=None,
         training=False,
     ):
@@ -504,27 +549,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
         labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the cross entropy classification loss.
             Indices should be in ``[0, ..., config.vocab_size - 1]``.
-
-        Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
         """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
         if isinstance(inputs, (tuple, list)):
-            labels = inputs[8] if len(inputs) > 8 else labels
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            labels = inputs[9] if len(inputs) > 9 else labels
+            if len(inputs) > 9:
+                inputs = inputs[:9]
         elif isinstance(inputs, (dict, BatchEncoding)):
             labels = inputs.pop("labels", labels)
@@ -537,21 +567,30 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
             training=training,
         )

         hidden_states = transformer_outputs[0]
         logits = self.transformer.tokens_embed(hidden_states, mode="linear")

-        outputs = (logits,) + transformer_outputs[1:]
-
+        loss = None
         if labels is not None:
             # shift labels to the left and cut last logit token
             logits = logits[:, :-1]
             labels = labels[:, 1:]
             loss = self.compute_loss(labels, logits)
-            outputs = (loss,) + outputs

-        return outputs  # lm_logits, (all hidden_states), (attentions)
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
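
The labels path above shifts logits and labels against each other before calling `compute_loss`: the logits at position t are scored against the token at t+1. A self-contained sketch of that alignment; the Keras loss here approximates what `TFCausalLanguageModelingLoss.compute_loss` does, minus its handling of ignored label indices:

    import tensorflow as tf

    logits = tf.random.uniform((2, 6, 100))                        # (batch, seq_len, vocab)
    labels = tf.random.uniform((2, 6), maxval=100, dtype=tf.int32)

    shifted_logits = logits[:, :-1]  # drop the last position: no next token follows it
    shifted_labels = labels[:, 1:]   # drop the first token: nothing predicts it
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    print(float(loss_fn(shifted_labels, shifted_logits)))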
 @add_start_docstrings(
@@ -575,6 +614,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         return self.transformer.tokens_embed

     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
     def call(
         self,
         inputs,
@@ -586,6 +626,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         mc_token_ids=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         r"""
@@ -594,27 +635,6 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             Selected in the range ``[0, input_ids.size(-1) - 1]``.

         Return:
-        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
-        lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
-            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-            Contains pre-computed hidden-states (key and values in the attention blocks).
-            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-            should not be passed as input ids as they have already been computed.
-        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            tuple of :obj:`tf.Tensor` (one for each layer) of shape
-            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.

         Examples::
@@ -646,7 +666,9 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
             output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
-            assert len(inputs) <= 8, "Too many inputs."
+            output_hidden_states = inputs[8] if len(inputs) > 8 else output_hidden_states
+            return_dict = inputs[9] if len(inputs) > 9 else return_dict
+            assert len(inputs) <= 10, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -656,9 +678,12 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
             output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 8, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 10, "Too many inputs."
         else:
             input_ids = inputs
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict

         if input_ids is not None:
             input_shapes = shape_list(input_ids)
@@ -679,6 +704,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             inputs_embeds,
             output_attentions,
             output_hidden_states,
+            return_dict=return_dict,
             training=training,
         )
         hidden_states = transformer_outputs[0]
@@ -686,6 +712,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
         mc_logits = tf.squeeze(mc_logits, axis=-1)
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

-        return outputs  # lm logits, mc logits, (all hidden_states), (attentions)
+        if not return_dict:
+            return (lm_logits, mc_logits) + transformer_outputs[1:]
+
+        return TFOpenAIGPTDoubleHeadsModelOutput(
+            lm_logits=lm_logits,
+            mc_logits=mc_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
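
The `Examples::` block in the docstring above is collapsed in this view; what follows is a hedged sketch of the usual double-heads usage, adapted from the library's standard [CLS]-based example rather than copied from this file:

    import tensorflow as tf
    from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel

    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    model = TFOpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")

    # Add a [CLS] token for the multiple choice head and resize the embeddings.
    tokenizer.add_special_tokens({"cls_token": "[CLS]"})
    model.resize_token_embeddings(len(tokenizer))

    choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
    input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # batch size 1, 2 choices
    mc_token_ids = tf.constant([input_ids.shape[-1] - 1, input_ids.shape[-1] - 1])[None, :]  # [CLS] positions

    outputs = model(input_ids, mc_token_ids=mc_token_ids, return_dict=True)
    print(outputs.lm_logits.shape, outputs.mc_logits.shape)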
@@ -62,8 +62,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
         config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
             model. Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
-        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
-            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
 """
......
@@ -711,7 +711,7 @@ class XLNetForTokenClassificationOutput(ModelOutput):
 @dataclass
 class XLNetForMultipleChoiceOutput(ModelOutput):
     """
-    Base class for outputs of multiple choice models.
+    Output type of :class:`~transformers.XLNetForMultipleChoice`.

     Args:
         loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -747,7 +747,7 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
 @dataclass
 class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
     """
-    Base class for outputs of question answering models.
+    Output type of :class:`~transformers.XLNetForQuestionAnsweringSimple`.

     Args:
         loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -784,7 +784,7 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
 @dataclass
 class XLNetForQuestionAnsweringOutput(ModelOutput):
     """
-    Base class for outputs of question answering models using a :obj:`SquadHead`.
+    Output type of :class:`~transformers.XLNetForQuestionAnswering`.

     Args:
         loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
@@ -1227,7 +1227,6 @@ class XLNetModel(XLNetPreTrainedModel):
         # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
         output = output.permute(1, 0, 2).contiguous()

-        # TODO Teven: fix this test to only use use_cache.
         if not use_cache:
             new_mems = None
......
@@ -24,9 +24,11 @@ from .utils import CACHE_DIR, require_tf, slow
 if is_tf_available():
+    import tensorflow as tf
+
     from transformers.modeling_tf_xxx import (
         TFXxxModel,
         TFXxxForMaskedLM,
+        TFXxxForMultipleChoice,
         TFXxxForSequenceClassification,
         TFXxxForTokenClassification,
         TFXxxForQuestionAnswering,
@@ -40,6 +42,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
         (
             TFXxxModel,
             TFXxxForMaskedLM,
+            TFXxxForMultipleChoice,
             TFXxxForQuestionAnswering,
             TFXxxForSequenceClassification,
             TFXxxForTokenClassification,
@@ -128,6 +131,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
             max_position_embeddings=self.max_position_embeddings,
             type_vocab_size=self.type_vocab_size,
             initializer_range=self.initializer_range,
+            return_dict=True,
         )

         return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -137,33 +141,26 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
     ):
         model = TFXxxModel(config=config)
         inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        sequence_output, pooled_output = model(inputs)
+        result = model(inputs)

         inputs = [input_ids, input_mask]
-        sequence_output, pooled_output = model(inputs)
+        result = model(inputs)

-        sequence_output, pooled_output = model(input_ids)
+        result = model(input_ids)

-        result = {
-            "sequence_output": sequence_output.numpy(),
-            "pooled_output": pooled_output.numpy(),
-        }
         self.parent.assertListEqual(
-            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
         )
-        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+        self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])

     def create_and_check_xxx_for_masked_lm(
         self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
     ):
         model = TFXxxForMaskedLM(config=config)
         inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (prediction_scores,) = model(inputs)
-        result = {
-            "prediction_scores": prediction_scores.numpy(),
-        }
+        result = model(inputs)
         self.parent.assertListEqual(
-            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
         )

     def create_and_check_xxx_for_sequence_classification(
@@ -172,22 +169,32 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
         config.num_labels = self.num_labels
         model = TFXxxForSequenceClassification(config=config)
         inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (logits,) = model(inputs)
-        result = {
-            "logits": logits.numpy(),
-        }
+        result = model(inputs)
         self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])

+    def create_and_check_bert_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = TFXxxForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        result = model(inputs)
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
     def create_and_check_xxx_for_token_classification(
         self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
     ):
         config.num_labels = self.num_labels
         model = TFXxxForTokenClassification(config=config)
         inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (logits,) = model(inputs)
-        result = {
-            "logits": logits.numpy(),
-        }
+        result = model(inputs)
         self.parent.assertListEqual(
             list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
         )
@@ -197,11 +204,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
     ):
         model = TFXxxForQuestionAnswering(config=config)
         inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        start_logits, end_logits = model(inputs)
-        result = {
-            "start_logits": start_logits.numpy(),
-            "end_logits": end_logits.numpy(),
-        }
+        result = model(inputs)
         self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
         self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
......
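
The test rewrites above all make the same move: instead of unpacking positional tuples, the assertions read `result["..."]` by name, which works because the test config sets `return_dict=True` and `ModelOutput` supports dict-style lookup. A minimal sketch of that contract, with toy field types rather than a real model run:

    from dataclasses import dataclass
    from typing import Optional

    from transformers.file_utils import ModelOutput

    @dataclass
    class ToyOutput(ModelOutput):
        logits: Optional[list] = None
        hidden_states: Optional[tuple] = None

    result = ToyOutput(logits=[[0.1, 0.9]])
    assert result["logits"] is result.logits      # key and attribute access agree
    assert result.to_tuple() == (result.logits,)  # None fields are dropped from the tuple view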
@@ -35,7 +35,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
             [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32,
         )  # J'aime le camembert !"
-        output = model(input_ids)[0]
+        output = model(input_ids)["last_hidden_state"]
         expected_shape = tf.TensorShape((1, 10, 768))
         self.assertEqual(output.shape, expected_shape)
         # compare the actual values for a slice.
......