Unverified Commit c67d1a02, authored by Sylvain Gugger, committed by GitHub

Tf model outputs (#6247)

* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten


* Add new models and fix issues

* Quality improvements

* Add T5

* A bit of cleanup

* Fix for slow tests

* Style
parent bd0eab35
@@ -22,6 +22,7 @@ import tensorflow as tf

 from .configuration_flaubert import FlaubertConfig
 from .file_utils import add_start_docstrings
+from .modeling_tf_outputs import TFBaseModelOutput
 from .modeling_tf_utils import keras_serializable, shape_list
 from .modeling_tf_xlm import (
     TFXLMForMultipleChoice,
@@ -103,6 +104,11 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             than the model's internal embedding lookup matrix.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -126,6 +132,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         self.pre_norm = getattr(config, "pre_norm", False)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict

     def call(
         self,
@@ -140,6 +147,7 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         # removed: src_enc=None, src_len=None
@@ -155,7 +163,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
             output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
             output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
-            assert len(inputs) <= 11, "Too many inputs."
+            return_dict = inputs[11] if len(inputs) > 11 else return_dict
+            assert len(inputs) <= 12, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -168,12 +177,14 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             output_attentions = inputs.get("output_attentions", output_attentions)
             output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
-            assert len(inputs) <= 11, "Too many inputs."
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 12, "Too many inputs."
         else:
             input_ids = inputs

         output_attentions = output_attentions if output_attentions is not None else self.output_attentions
         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -260,8 +271,8 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         tensor = tensor * mask[..., tf.newaxis]

         # transformer layers
-        hidden_states = ()
-        attentions = ()
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
         for i in range(self.n_layers):
             # LayerDrop
             dropout_probability = random.uniform(0, 1)
@@ -321,12 +332,9 @@ class TFFlaubertMainLayer(TFXLMMainLayer):
         # move back sequence length to dimension 0
         # tensor = tensor.transpose(0, 1)

-        outputs = (tensor,)
-        if output_hidden_states:
-            outputs = outputs + (hidden_states,)
-        if output_attentions:
-            outputs = outputs + (attentions,)
-        return outputs  # outputs, (hidden_states), (attentions)
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+        return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)


 @add_start_docstrings(
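For reviewers, the caller-facing effect of this hunk is easiest to see outside the diff. A minimal sketch, assuming a standard Flaubert checkpoint (the checkpoint name is illustrative, not part of this commit):

    from transformers import FlaubertTokenizer, TFFlaubertModel

    tokenizer = FlaubertTokenizer.from_pretrained("flaubert-base-cased")
    model = TFFlaubertModel.from_pretrained("flaubert-base-cased")
    inputs = tokenizer("Le chat dort.", return_tensors="tf")

    # Old-style tuple output is still the default:
    last_hidden_state = model(inputs)[0]

    # With return_dict=True, the same tensor is available by name:
    outputs = model(inputs, return_dict=True)
    last_hidden_state = outputs.last_hidden_state

Because hidden_states and attentions are now None unless requested, the tuple branch no longer pads the output with empty tuples.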
...
@@ -17,12 +17,21 @@

 import logging
+from dataclasses import dataclass
+from typing import List, Optional, Tuple

 import numpy as np
 import tensorflow as tf

 from .configuration_gpt2 import GPT2Config
-from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+    replace_return_docstrings,
+)
+from .modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast
 from .modeling_tf_utils import (
     TFCausalLanguageModelingLoss,
     TFConv1D,
@@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding
 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "GPT2Config"
 _TOKENIZER_FOR_DOC = "GPT2Tokenizer"

 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -214,12 +224,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.use_cache = config.use_cache
+        self.return_dict = config.use_return_dict

         self.num_hidden_layers = config.n_layer
         self.vocab_size = config.vocab_size
         self.n_embd = config.n_embd
-        self.output_hidden_states = self.output_hidden_states
-        self.output_attentions = self.output_attentions

         self.wte = TFSharedEmbeddings(
             config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte"
@@ -259,6 +268,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         if isinstance(inputs, (tuple, list)):
@@ -272,7 +282,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             use_cache = inputs[7] if len(inputs) > 7 else use_cache
             output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
             output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
-            assert len(inputs) <= 10, "Too many inputs."
+            return_dict = inputs[10] if len(inputs) > 10 else return_dict
+            assert len(inputs) <= 11, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             past = inputs.get("past", past)
@@ -284,13 +295,15 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             use_cache = inputs.get("use_cache", use_cache)
             output_attentions = inputs.get("output_attentions", output_attentions)
             output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
-            assert len(inputs) <= 10, "Too many inputs."
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 11, "Too many inputs."
         else:
             input_ids = inputs

         output_attentions = output_attentions if output_attentions is not None else self.output_attentions
         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
         use_cache = use_cache if use_cache is not None else self.use_cache
+        return_dict = return_dict if return_dict is not None else self.return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -355,9 +368,9 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         output_shape = input_shape + [shape_list(hidden_states)[-1]]

-        presents = ()
-        all_attentions = []
-        all_hidden_states = ()
+        presents = () if use_cache else None
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
         for i, (block, layer_past) in enumerate(zip(self.h, past)):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
@@ -373,10 +386,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             )

             hidden_states, present = outputs[:2]
-            presents = presents + (present,)
+            if use_cache:
+                presents = presents + (present,)

             if output_attentions:
-                all_attentions.append(outputs[2])
+                all_attentions = all_attentions + (outputs[2],)

         hidden_states = self.ln_f(hidden_states)
@@ -385,18 +399,20 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states,)
-        if use_cache:
-            outputs = outputs + (presents,)
-        if output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
         if output_attentions:
             # let the number of heads free (-1) so we can extract attention even after head pruning
             attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
             all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
-            outputs = outputs + (all_attentions,)
-        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
+
+        return TFBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )


 class TFGPT2PreTrainedModel(TFPreTrainedModel):
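The new TFBaseModelOutputWithPast makes the GPT-2 cache an explicitly named field. A hedged sketch of incremental decoding against it; the checkpoint name is illustrative, and the `past` argument name is taken from the surrounding diff:

    from transformers import GPT2Tokenizer, TFGPT2Model

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2Model.from_pretrained("gpt2")

    # First pass caches the per-layer key/value states:
    out = model(tokenizer("Hello, my dog", return_tensors="tf"), use_cache=True, return_dict=True)

    # Second pass feeds only the new token plus the cached states:
    next_ids = tokenizer(" is", return_tensors="tf")["input_ids"]
    out2 = model(next_ids, past=out.past_key_values, use_cache=True, return_dict=True)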
@@ -408,6 +424,42 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel):
     base_model_prefix = "transformer"


+@dataclass
+class TFGPT2DoubleHeadsModelOutput(ModelOutput):
+    """
+    Output type of :class:`~transformers.TFGPT2DoubleHeadsModel`.
+
+    Args:
+        lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
+            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+        past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
+            :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+            ``past_key_values`` input) to speed up sequential decoding.
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    lm_logits: tf.Tensor = None
+    mc_logits: tf.Tensor = None
+    past_key_values: Optional[List[tf.Tensor]] = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[tf.Tensor]] = None
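For reviewers unfamiliar with ModelOutput: these dataclasses behave like both tuples and dicts, and fields left as None are dropped from the mapping, which is what keeps the tuple fall-backs in this diff aligned. A self-contained illustration (ToyOutput is hypothetical, not part of this commit):

    from dataclasses import dataclass
    from typing import Optional

    import tensorflow as tf

    from transformers.file_utils import ModelOutput

    @dataclass
    class ToyOutput(ModelOutput):
        logits: tf.Tensor = None
        extras: Optional[tf.Tensor] = None

    out = ToyOutput(logits=tf.constant([1.0]))
    assert out[0] is out.logits          # positional and attribute access agree
    assert out["logits"] is out.logits   # dict-style access works too
    assert "extras" not in out           # None fields are skipped, as in to_tuple()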
 GPT2_START_DOCSTRING = r"""

     .. note::

@@ -482,6 +534,11 @@ GPT2_INPUTS_DOCSTRING = r"""
             (if set to :obj:`False`) for evaluation.
         output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
             If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -495,29 +552,13 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
         self.transformer = TFGPT2MainLayer(config, name="transformer")

     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="gpt2",
+        output_type=TFBaseModelOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(self, inputs, **kwargs):
-        r"""
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-                Sequence of hidden-states at the last layer of the model.
-            past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-                Contains pre-computed hidden-states (key and values in the attention blocks).
-                Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-                should not be passed as input ids as they have already been computed.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.
-        """
         outputs = self.transformer(inputs, **kwargs)

         return outputs
@@ -543,7 +584,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
         return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}

     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="gpt2",
+        output_type=TFCausalLMOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(
         self,
         inputs,
@@ -556,6 +602,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         labels=None,
         training=False,
     ):
@@ -563,31 +610,12 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
         labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the cross entropy classification loss.
             Indices should be in ``[0, ..., config.vocab_size - 1]``.
-
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-            prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-            past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-                Contains pre-computed hidden-states (key and values in the attention blocks).
-                Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-                should not be passed as input ids as they have already been computed.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.
         """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
         if isinstance(inputs, (tuple, list)):
-            labels = inputs[10] if len(inputs) > 10 else labels
-            if len(inputs) > 10:
-                inputs = inputs[:10]
+            labels = inputs[11] if len(inputs) > 11 else labels
+            if len(inputs) > 11:
+                inputs = inputs[:11]
         elif isinstance(inputs, (dict, BatchEncoding)):
             labels = inputs.pop("labels", labels)
@@ -602,6 +630,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
             training=training,
         )
@@ -609,15 +638,24 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
         logits = self.transformer.wte(hidden_states, mode="linear")

-        outputs = (logits,) + transformer_outputs[1:]
+        loss = None
         if labels is not None:
             # shift labels to the left and cut last logit token
             logits = logits[:, :-1]
             labels = labels[:, 1:]
             loss = self.compute_loss(labels, logits)
-            outputs = (loss,) + outputs

-        return outputs  # lm_logits, presents, (all hidden_states), (attentions)
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )


 @add_start_docstrings(
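A hedged usage sketch for the refactored loss path above (checkpoint name illustrative; note that compute_loss in the TF models returns an unreduced per-token loss, so training code typically reduces it before applying gradients):

    from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    enc = tokenizer("Hello world", return_tensors="tf")
    out = model(enc["input_ids"], labels=enc["input_ids"], return_dict=True)

    loss = out.loss      # present because labels were passed
    logits = out.logits  # shifted logits when labels are given, per the diff above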
@@ -641,6 +679,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         return self.transformer.wte

     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
     def call(
         self,
         inputs,
@@ -654,6 +693,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         r"""
@@ -662,26 +702,6 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
             Selected in the range ``[0, input_ids.size(-1) - 1[``.

         Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
-            lm_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
-                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-            mc_prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
-                Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
-            past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
-                Contains pre-computed hidden-states (key and values in the attention blocks).
-                Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-                should not be passed as `input_ids` as they have already been computed.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.

         Examples::
@@ -717,8 +737,10 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
             inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
             mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids
             use_cache = inputs[8] if len(inputs) > 8 else use_cache
-            output_attentions = inputs[9] if len(inputs) > 8 else output_attentions
-            assert len(inputs) <= 10, "Too many inputs."
+            output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
+            output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+            return_dict = inputs[11] if len(inputs) > 11 else return_dict
+            assert len(inputs) <= 12, "Too many inputs."
         elif isinstance(inputs, dict):
             input_ids = inputs.get("input_ids")
             past = inputs.get("past", past)
@@ -730,9 +752,12 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
             mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
             use_cache = inputs.get("use_cache", use_cache)
             output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 12, "Too many inputs."
         else:
             input_ids = inputs

+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
+
         if input_ids is not None:
             input_shapes = shape_list(input_ids)
@@ -755,6 +780,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
             use_cache,
             output_attentions,
             output_hidden_states,
+            return_dict=return_dict,
             training=training,
         )
         hidden_states = transformer_outputs[0]
@@ -762,6 +788,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         lm_logits = self.transformer.wte(hidden_states, mode="linear")
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
         mc_logits = tf.squeeze(mc_logits, axis=-1)
-        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

-        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
+        if not return_dict:
+            return (lm_logits, mc_logits) + transformer_outputs[1:]
+
+        return TFGPT2DoubleHeadsModelOutput(
+            lm_logits=lm_logits,
+            mc_logits=mc_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
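A hedged sketch of reading both heads from the new output type. Tokenization is simplified: it assumes the two choices tokenize to the same length so no padding is needed (GPT-2's tokenizer has no pad token by default), and the checkpoint name is illustrative:

    import tensorflow as tf
    from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2DoubleHeadsModel.from_pretrained("gpt2")

    choices = ["Hello, my dog is cute", "Hello, my cat is cute"]
    enc = tokenizer(choices, return_tensors="tf")
    input_ids = tf.expand_dims(enc["input_ids"], 0)          # (batch=1, num_choices=2, seq_len)
    mc_token_ids = tf.fill((1, 2), input_ids.shape[-1] - 1)  # classify on the last token

    out = model(input_ids, mc_token_ids=mc_token_ids, return_dict=True)
    lm_logits = out.lm_logits  # (1, 2, seq_len, vocab_size)
    mc_logits = out.mc_logits  # (1, 2)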
@@ -17,17 +17,31 @@

 import logging
+from dataclasses import dataclass
+from typing import Optional, Tuple

 import tensorflow as tf

 from . import MobileBertConfig
 from .file_utils import (
     MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_callable,
+    replace_return_docstrings,
 )
 from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
+from .modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFBaseModelOutputWithPooling,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFNextSentencePredictorOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
 from .modeling_tf_utils import (
     TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
@@ -44,6 +58,7 @@ from .tokenization_utils import BatchEncoding
 logger = logging.getLogger(__name__)

+_CONFIG_FOR_DOC = "MobileBertConfig"
 _TOKENIZER_FOR_DOC = "MobileBertTokenizer"

 TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -541,9 +556,18 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
         self.output_hidden_states = config.output_hidden_states
         self.layer = [TFMobileBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]

-    def call(self, hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states, training=False):
-        all_hidden_states = ()
-        all_attentions = ()
+    def call(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        output_attentions,
+        output_hidden_states,
+        return_dict,
+        training=False,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
         for i, layer_module in enumerate(self.layer):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -561,12 +585,11 @@ class TFMobileBertEncoder(tf.keras.layers.Layer):
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

-        outputs = (hidden_states,)
-        if output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # outputs, (hidden states), (attentions)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )


 class TFMobileBertPooler(tf.keras.layers.Layer):
@@ -660,6 +683,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
         self.num_hidden_layers = config.num_hidden_layers
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict

         self.embeddings = TFMobileBertEmbeddings(config, name="embeddings")
         self.encoder = TFMobileBertEncoder(config, name="encoder")
@@ -688,6 +712,7 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         training=False,
     ):
         if isinstance(inputs, (tuple, list)):
@@ -699,7 +724,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
             output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
             output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
-            assert len(inputs) <= 8, "Too many inputs."
+            return_dict = inputs[8] if len(inputs) > 8 else return_dict
+            assert len(inputs) <= 9, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -709,12 +735,14 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             output_attentions = inputs.get("output_attentions", output_attentions)
             output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
-            assert len(inputs) <= 8, "Too many inputs."
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 9, "Too many inputs."
         else:
             input_ids = inputs

         output_attentions = output_attentions if output_attentions is not None else self.output_attentions
         output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.return_dict

         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -763,16 +791,22 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
             head_mask,
             output_attentions,
             output_hidden_states,
+            return_dict,
             training=training,
         )

         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)

-        outputs = (sequence_output, pooled_output,) + encoder_outputs[
-            1:
-        ]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
+        if not return_dict:
+            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )


 class TFMobileBertPreTrainedModel(TFPreTrainedModel):
@@ -784,6 +818,37 @@ class TFMobileBertPreTrainedModel(TFPreTrainedModel):
     base_model_prefix = "mobilebert"


+@dataclass
+class TFMobileBertForPreTrainingOutput(ModelOutput):
+    """
+    Output type of :class:`~transformers.TFMobileBertForPreTraining`.
+
+    Args:
+        prediction_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False
+            continuation before SoftMax).
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[tf.Tensor] = None
+    prediction_logits: tf.Tensor = None
+    seq_relationship_logits: tf.Tensor = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[tf.Tensor]] = None
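A hedged sketch of consuming the two pre-training heads through this output type (checkpoint name illustrative):

    from transformers import MobileBertTokenizer, TFMobileBertForPreTraining

    tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    model = TFMobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
    out = model(inputs, return_dict=True)

    prediction_logits = out.prediction_logits              # (batch, seq_len, vocab_size)
    seq_relationship_logits = out.seq_relationship_logits  # (batch, 2)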
 MOBILEBERT_START_DOCSTRING = r"""
     This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
     Use it as a regular TF 2.0 Keras Model and

@@ -852,6 +917,13 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
         training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
             Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
             (if set to :obj:`False`) for evaluation.
+        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
 """
@@ -865,32 +937,13 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/mobilebert-uncased",
+        output_type=TFBaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(self, inputs, **kwargs):
-        r"""
-        Returns:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
-            last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
-                Sequence of hidden-states at the output of the last layer of the model.
-            pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
-                Last layer hidden-state of the first token of the sequence (classification token)
-                further processed by a Linear layer and a Tanh activation function. The Linear
-                layer weights are trained from the next sentence prediction (classification)
-                objective during the original Bert pretraining. This output is usually *not* a good summary
-                of the semantic content of the input, you're often better with averaging or pooling
-                the sequence of hidden-states for the whole input sequence.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.
-        """
         outputs = self.mobilebert(inputs, **kwargs)

         return outputs
@@ -911,25 +964,10 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
         return self.mobilebert.embeddings

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
         Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
-            prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-            seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
-                Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.

         Examples::
@@ -943,16 +981,23 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
             >>> prediction_scores, seq_relationship_scores = outputs[:2]
         """
+        return_dict = kwargs.get("return_dict")
+        return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
         outputs = self.mobilebert(inputs, **kwargs)

         sequence_output, pooled_output = outputs[:2]
         prediction_scores = self.predictions(sequence_output)
         seq_relationship_score = self.seq_relationship(pooled_output)

-        outputs = (prediction_scores, seq_relationship_score,) + outputs[
-            2:
-        ]  # add hidden states and attention if they are here
-        return outputs  # prediction_scores, seq_relationship_score, (hidden_states), (attentions)
+        if not return_dict:
+            return (prediction_scores, seq_relationship_score) + outputs[2:]
+
+        return TFMobileBertForPreTrainingOutput(
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )


 @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
@@ -967,7 +1012,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
         return self.mobilebert.embeddings

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="google/mobilebert-uncased",
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def call(
         self,
         inputs=None,
@@ -978,6 +1028,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        return_dict=None,
         labels=None,
         training=False,
     ):
@@ -986,27 +1037,12 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
             Labels for computing the masked language modeling loss.
             Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
-
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
-            prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.
         """
+        return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
         if isinstance(inputs, (tuple, list)):
-            labels = inputs[8] if len(inputs) > 8 else labels
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            labels = inputs[9] if len(inputs) > 9 else labels
+            if len(inputs) > 9:
+                inputs = inputs[:9]
         elif isinstance(inputs, (dict, BatchEncoding)):
             labels = inputs.pop("labels", labels)
@@ -1019,18 +1055,22 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
             training=training,
         )
         sequence_output = outputs[0]
         prediction_scores = self.mlm(sequence_output, training=training)

-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-        if labels is not None:
-            loss = self.compute_loss(labels, prediction_scores)
-            outputs = (loss,) + outputs
+        loss = None if labels is None else self.compute_loss(labels, prediction_scores)

-        return outputs  # (loss), prediction_scores, (hidden_states), (attentions)
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMaskedLMOutput(
+            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )


 class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer):
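A hedged sketch of the refactored masked-LM path above (checkpoint name illustrative; reusing the input ids as labels computes the loss over every position, which is enough to exercise the code path):

    from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM

    tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
    model = TFMobileBertForMaskedLM.from_pretrained("google/mobilebert-uncased")

    inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
    out = model(inputs, labels=inputs["input_ids"], return_dict=True)

    loss = out.loss      # set because labels were passed
    logits = out.logits  # (batch, seq_len, vocab_size)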
@@ -1055,23 +1095,10 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
         self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")

     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
     def call(self, inputs, **kwargs):
         r"""
         Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
-            seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
-                Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-                heads.

         Examples::
...@@ -1087,14 +1114,19 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1087,14 +1114,19 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
""" """
return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
outputs = self.mobilebert(inputs, **kwargs)
pooled_output = outputs[1]
seq_relationship_score = self.cls(pooled_output)
if not return_dict:
return (seq_relationship_score,) + outputs[2:]
return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings(
...@@ -1114,7 +1146,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs=None,
...@@ -1125,6 +1162,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
...@@ -1134,27 +1172,12 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
...@@ -1167,6 +1190,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
...@@ -1175,13 +1199,15 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
loss = None if labels is None else self.compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
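# Loss/logits sketch (illustrative, not part of this diff; assumes the same public
# checkpoint with a freshly initialized classification head)::
#
#     >>> import tensorflow as tf
#     >>> from transformers import MobileBertTokenizer, TFMobileBertForSequenceClassification
#     >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
#     >>> model = TFMobileBertForSequenceClassification.from_pretrained("google/mobilebert-uncased")
#     >>> inputs = tokenizer("A short review.", return_tensors="tf")
#     >>> out = model(inputs, labels=tf.constant([1]), return_dict=True)
#     >>> out.loss is not None  # True: the loss field is filled only when labels are passed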
@add_start_docstrings(
...@@ -1200,7 +1226,12 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs=None,
...@@ -1211,6 +1242,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
start_positions=None,
end_positions=None,
training=False,
...@@ -1224,30 +1256,13 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)):
start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 9:
inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", end_positions)
...@@ -1261,6 +1276,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
...@@ -1271,15 +1287,23 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, (start_logits, end_logits))
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
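# Span-extraction sketch (illustrative, not part of this diff; checkpoint name is
# an assumption)::
#
#     >>> import tensorflow as tf
#     >>> from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
#     >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
#     >>> model = TFMobileBertForQuestionAnswering.from_pretrained("google/mobilebert-uncased")
#     >>> inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="tf")
#     >>> out = model(inputs, return_dict=True)
#     >>> start = int(tf.argmax(out.start_logits, axis=-1)[0])  # answer span start
#     >>> end = int(tf.argmax(out.end_logits, axis=-1)[0])      # answer span end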
@add_start_docstrings(
...@@ -1307,7 +1331,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs,
...@@ -1318,6 +1347,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
...@@ -1326,24 +1356,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
...@@ -1354,8 +1366,9 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
return_dict = inputs[8] if len(inputs) > 8 else return_dict
labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -1365,10 +1378,12 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels)
assert len(inputs) <= 10, "Too many inputs."
else:
input_ids = inputs
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
...@@ -1395,19 +1410,23 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
flat_inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
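# Input-layout sketch (illustrative, not part of this diff; assumes the same
# public checkpoint): this head expects (batch_size, num_choices, sequence_length)::
#
#     >>> import tensorflow as tf
#     >>> from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
#     >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
#     >>> model = TFMobileBertForMultipleChoice.from_pretrained("google/mobilebert-uncased")
#     >>> prompt, choices = "The sky is", ["blue.", "made of cheese."]
#     >>> enc = tokenizer([prompt, prompt], choices, return_tensors="tf", padding=True)
#     >>> inputs = {k: tf.reshape(v, (1, 2, -1)) for k, v in enc.items()}
#     >>> model(inputs, return_dict=True).logits  # shape (1, 2): one score per choice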
@add_start_docstrings(
...@@ -1427,7 +1446,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="google/mobilebert-uncased",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs=None,
...@@ -1438,6 +1462,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
...@@ -1445,27 +1470,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
...@@ -1478,6 +1488,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
...@@ -1486,10 +1497,12 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
loss = None if labels is None else self.compute_loss(labels, logits)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
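# Per-token scores sketch (illustrative, not part of this diff; checkpoint name is
# an assumption)::
#
#     >>> import tensorflow as tf
#     >>> from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
#     >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
#     >>> model = TFMobileBertForTokenClassification.from_pretrained("google/mobilebert-uncased")
#     >>> inputs = tokenizer("Jane lives in Paris", return_tensors="tf")
#     >>> out = model(inputs, return_dict=True)  # TFTokenClassifierOutput
#     >>> tf.argmax(out.logits, axis=-1)  # one predicted label id per token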
...@@ -17,12 +17,21 @@
import logging
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig
from .file_utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFConv1D,
...@@ -38,6 +47,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "OpenAIGPTConfig"
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -208,6 +218,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
super().__init__(*inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
...@@ -247,6 +258,7 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
...@@ -258,7 +270,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -268,12 +281,14 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 9, "Too many inputs."
else:
input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -333,8 +348,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
output_shape = input_shape + [shape_list(hidden_states)[-1]]
all_attentions = () if output_attentions else None
all_hidden_states = () if output_hidden_states else None
for i, block in enumerate(self.h):
if output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
...@@ -342,22 +357,24 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions, training=training)
hidden_states = outputs[0]
if output_attentions:
all_attentions = all_attentions + (outputs[1],)
hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions,
)
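# The branch above keeps backward compatibility: with return_dict=False the layer
# returns a plain tuple that skips None entries. A minimal equivalence sketch
# (illustrative, not part of this diff; checkpoint name is an assumption)::
#
#     >>> import tensorflow as tf
#     >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
#     >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     >>> model = TFOpenAIGPTModel.from_pretrained("openai-gpt")
#     >>> inputs = tokenizer("hello there", return_tensors="tf")
#     >>> as_tuple = model(inputs, return_dict=False)
#     >>> as_output = model(inputs, return_dict=True)
#     >>> bool(tf.reduce_all(as_tuple[0] == as_output.last_hidden_state))  # True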
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
...@@ -369,6 +386,35 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer"
@dataclass
class TFOpenAIGPTDoubleHeadsModelOutput(ModelOutput):
"""
Base class for outputs of models with a language modeling head and a multiple choice classification head.
Args:
lm_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mc_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
lm_logits: tf.Tensor = None
mc_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
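# Quick field-access sketch for the dataclass above (dummy shapes, illustrative only)::
#
#     >>> import tensorflow as tf
#     >>> out = TFOpenAIGPTDoubleHeadsModelOutput(
#     ...     lm_logits=tf.zeros((1, 2, 5, 10)),  # (batch, num_choices, seq_len, vocab)
#     ...     mc_logits=tf.zeros((1, 2)),         # (batch, num_choices)
#     ... )
#     >>> out.hidden_states is None  # optional fields stay None unless requested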
OPENAI_GPT_START_DOCSTRING = r"""
.. note::
...@@ -436,6 +482,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -449,25 +500,13 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): ...@@ -449,25 +500,13 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="openai-gpt",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
...@@ -486,7 +525,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="openai-gpt",
output_type=TFCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(
self,
inputs,
...@@ -497,6 +541,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
...@@ -504,27 +549,12 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
"""
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)):
labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 9:
inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
...@@ -537,21 +567,30 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
logits = self.transformer.tokens_embed(hidden_states, mode="linear")
loss = None
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
if not return_dict:
output = (logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
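# Note on the shift above: position t is trained to predict token t + 1, so the
# logits drop their last step and the labels drop their first. Sketch of the
# resulting shapes (illustrative; checkpoint name is an assumption)::
#
#     >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
#     >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     >>> model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
#     >>> inputs = tokenizer("hello there my friend", return_tensors="tf")
#     >>> out = model(inputs, labels=inputs["input_ids"], return_dict=True)
#     >>> out.loss is not None  # True: labels were passed
#     >>> out.logits.shape[1] == inputs["input_ids"].shape[1] - 1  # truncated by the shift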
@add_start_docstrings(
...@@ -575,6 +614,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFOpenAIGPTDoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
inputs,
...@@ -586,6 +626,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
r"""
...@@ -594,27 +635,6 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Selected in the range ``[0, input_ids.size(-1) - 1]``.
Return:
Examples::
...@@ -646,7 +666,9 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
output_hidden_states = inputs[8] if len(inputs) > 8 else output_hidden_states
return_dict = inputs[9] if len(inputs) > 9 else return_dict
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -656,9 +678,12 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
mc_token_ids = inputs.get("mc_token_ids", mc_token_ids)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 10, "Too many inputs."
else:
input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None:
input_shapes = shape_list(input_ids)
...@@ -679,6 +704,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
hidden_states = transformer_outputs[0]
...@@ -686,6 +712,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids, training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1)
if not return_dict:
return (lm_logits, mc_logits) + transformer_outputs[1:]
return TFOpenAIGPTDoubleHeadsModelOutput(
lm_logits=lm_logits,
mc_logits=mc_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
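# Two-heads sketch (dummy token ids, illustrative only); mc_token_ids selects the
# token whose hidden state feeds the choice classifier::
#
#     >>> import tensorflow as tf
#     >>> from transformers import TFOpenAIGPTDoubleHeadsModel
#     >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
#     >>> input_ids = tf.constant([[[481, 2, 3], [481, 4, 5]]])  # (batch=1, num_choices=2, seq_len=3)
#     >>> mc_token_ids = tf.constant([[2, 2]])  # index of the last token of each choice
#     >>> out = model(input_ids, mc_token_ids=mc_token_ids, return_dict=True)
#     >>> out.lm_logits.shape, out.mc_logits.shape  # (1, 2, 3, vocab_size), (1, 2)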
from dataclasses import dataclass
from typing import List, Optional, Tuple
import tensorflow as tf
from .file_utils import ModelOutput
@dataclass
class TFBaseModelOutput(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
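# Access sketch for these output classes (model and checkpoint are assumptions; any
# TF model returning TFBaseModelOutput behaves the same)::
#
#     >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
#     >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
#     >>> model = TFOpenAIGPTModel.from_pretrained("openai-gpt")
#     >>> out = model(tokenizer("hello", return_tensors="tf"), output_hidden_states=True, return_dict=True)
#     >>> out.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
#     >>> len(out.hidden_states)  # one entry per layer, plus one for the embeddings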
@dataclass
class TFBaseModelOutputWithPooling(ModelOutput):
"""
Base class for model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during pretraining.
This output is usually *not* a good summary of the semantic content of the input;
you're often better off averaging or pooling the sequence of hidden-states over
the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
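# Pooled-variant sketch (assumes the public bert-base-uncased checkpoint, whose TF
# model returns this class)::
#
#     >>> from transformers import BertTokenizer, TFBertModel
#     >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     >>> model = TFBertModel.from_pretrained("bert-base-uncased")
#     >>> out = model(tokenizer("hello world", return_tensors="tf"), return_dict=True)
#     >>> out.last_hidden_state.shape  # (1, sequence_length, hidden_size)
#     >>> out.pooler_output.shape  # (1, hidden_size), built from the [CLS] position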
@dataclass
class TFBaseModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `past_key_values` is used, only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
``past_key_values`` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
past_key_values: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
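# Caching sketch (assumptions: TFGPT2Model adopts this class and returns its cache
# by default, mirroring config.use_cache on the PyTorch side)::
#
#     >>> from transformers import GPT2Tokenizer, TFGPT2Model
#     >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     >>> model = TFGPT2Model.from_pretrained("gpt2")
#     >>> out = model(tokenizer("hello", return_tensors="tf"), return_dict=True)
#     >>> past = out.past_key_values  # one tensor per layer, reusable through the `past` input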
@dataclass
class TFSeq2SeqModelOutput(ModelOutput):
"""
Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up
sequential decoding.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the decoder of the model.
If ``decoder_past_key_values`` is used, only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
last_hidden_state: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
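# Seq2seq sketch (assumptions: the public t5-small checkpoint, and that the bare
# TFT5Model accepts explicit decoder_input_ids as on the PyTorch side)::
#
#     >>> from transformers import T5Tokenizer, TFT5Model
#     >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
#     >>> model = TFT5Model.from_pretrained("t5-small")
#     >>> ids = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="tf")["input_ids"]
#     >>> out = model(ids, decoder_input_ids=ids, return_dict=True)
#     >>> out.last_hidden_state.shape  # decoder hidden states
#     >>> out.encoder_last_hidden_state.shape  # encoder hidden states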
@dataclass
class TFCausalLMOutput(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
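# A minimal sketch (not exercised by this module) of how these outputs behave, assuming
# the dict/attribute semantics of :class:`~transformers.file_utils.ModelOutput`: fields
# are filled by keyword and optional ones default to :obj:`None`.
#
#     import tensorflow as tf
#
#     out = TFCausalLMOutput(logits=tf.zeros((1, 5, 100)))
#     out.logits.shape  # (1, 5, 100)
#     out.loss is None  # True, since no labels were passed to the model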
@dataclass
class TFCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (keys and values in the attention blocks) that can be used (see
``past_key_values`` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
past_key_values: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
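# Schematic sketch of why ``past_key_values`` exists: a decoding loop can feed the cache
# back instead of re-encoding the whole prefix. ``model`` is a placeholder, and the name
# of the cache argument is an assumption (some TF models call it ``past``).
#
#     outputs = model(input_ids, use_cache=True, return_dict=True)
#     next_token = tf.argmax(outputs.logits[:, -1, :], axis=-1)[:, None]
#     outputs = model(next_token, past=outputs.past_key_values, use_cache=True, return_dict=True)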
@dataclass
class TFMaskedLMOutput(ModelOutput):
"""
Base class for masked language model outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Masked language modeling (MLM) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqLMOutput(ModelOutput):
"""
Base class for sequence-to-sequence language model outputs.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Language modeling loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (keys and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
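# Sketch (``model`` and the id tensors are placeholders): with ``return_dict=True`` the
# encoder and decoder tensors are available by name instead of by tuple position.
#
#     out = model(input_ids, decoder_input_ids=decoder_input_ids, return_dict=True)
#     out.logits                     # (batch_size, target_length, vocab_size)
#     out.encoder_last_hidden_state  # (batch_size, source_length, hidden_size)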
@dataclass
class TFNextSentencePredictorOutput(ModelOutput):
"""
Base class for outputs of models predicting if two sentences are consecutive or not.
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sentence classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFSeq2SeqSequenceClassifierOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence sentence classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (keys and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFMultipleChoiceModelOutput(ModelOutput):
"""
Base class for outputs of multiple choice models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
Classification scores (before SoftMax). `num_choices` is the second dimension of the input
tensors (see `input_ids` above).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
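# Sketch (``outputs`` is a placeholder): the predicted choice per example is the argmax
# over the ``num_choices`` axis.
#
#     import tensorflow as tf
#
#     best_choice = tf.argmax(outputs.logits, axis=-1)  # shape (batch_size,)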
@dataclass
class TFTokenClassifierOutput(ModelOutput):
"""
Base class for outputs of token classification models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
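# Sketch (``outputs`` is a placeholder): per-token label predictions are the argmax over
# the ``num_labels`` axis.
#
#     import tensorflow as tf
#
#     predictions = tf.argmax(outputs.logits, axis=-1)  # shape (batch_size, sequence_length)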
@dataclass
class TFQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of question answering models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss: the sum of the Cross-Entropy losses for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
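# Sketch (``outputs`` and ``input_ids`` are placeholders): a greedy answer span can be
# read off the two logit tensors; a real decoder should also check ``start <= end``.
#
#     import tensorflow as tf
#
#     start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
#     end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
#     answer_token_ids = input_ids[0, start : end + 1]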
@dataclass
class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput):
"""
Base class for outputs of sequence-to-sequence question answering models.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss: the sum of the Cross-Entropy losses for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Span-end scores (before SoftMax).
decoder_past_key_values (:obj:`List[tf.Tensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
List of :obj:`tf.Tensor` of length :obj:`config.n_layers`, with each tensor of shape
:obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (keys and values in the attention blocks) of the decoder that can be
used (see ``decoder_past_key_values`` input) to speed up sequential decoding.
decoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
decoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
encoder_last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder of the model.
encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
decoder_past_key_values: Optional[List[tf.Tensor]] = None
decoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
decoder_attentions: Optional[Tuple[tf.Tensor]] = None
encoder_last_hidden_state: Optional[tf.Tensor] = None
encoder_hidden_states: Optional[Tuple[tf.Tensor]] = None
encoder_attentions: Optional[Tuple[tf.Tensor]] = None
...@@ -28,6 +28,14 @@ from .file_utils import ( ...@@ -28,6 +28,14 @@ from .file_utils import (
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_outputs import (
TFBaseModelOutputWithPooling,
TFMaskedLMOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss, TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
...@@ -44,6 +52,7 @@ from .tokenization_utils_base import BatchEncoding ...@@ -44,6 +52,7 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "RobertaConfig"
_TOKENIZER_FOR_DOC = "RobertaTokenizer" _TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -190,6 +199,11 @@ ROBERTA_INPUTS_DOCSTRING = r""" ...@@ -190,6 +199,11 @@ ROBERTA_INPUTS_DOCSTRING = r"""
(if set to :obj:`False`) for evaluation. (if set to :obj:`False`) for evaluation.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -203,32 +217,13 @@ class TFRobertaModel(TFRobertaPreTrainedModel): ...@@ -203,32 +217,13 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFBaseModelOutputWithPooling,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(inputs, **kwargs)
return outputs return outputs
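# Sketch of the two return modes wired up above ("roberta-base" as in the decorator):
#
#     from transformers import RobertaTokenizer, TFRobertaModel
#
#     tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#     model = TFRobertaModel.from_pretrained("roberta-base")
#     input_ids = tokenizer("Hello world", return_tensors="tf")["input_ids"]
#     sequence_output, pooled_output = model(input_ids)[:2]   # legacy tuple
#     out = model(input_ids, return_dict=True)                # TFBaseModelOutputWithPooling
#     assert out.last_hidden_state.shape == sequence_output.shape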
...@@ -276,7 +271,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -276,7 +271,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
return self.lm_head.decoder return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -287,6 +287,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -287,6 +287,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -296,27 +297,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -296,27 +297,12 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
...@@ -329,6 +315,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -329,6 +315,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
...@@ -337,13 +324,15 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -337,13 +324,15 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output) prediction_scores = self.lm_head(sequence_output)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, prediction_scores)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, prediction_scores) output = (prediction_scores,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), prediction_scores, (hidden_states), (attentions) return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
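# Sketch of the backward-compatible contract implemented above (``tokenizer`` and
# ``model`` are placeholders for a RoBERTa tokenizer and ``TFRobertaForMaskedLM``): the
# tuple path is kept when ``return_dict`` is falsy, and the loss only appears when
# ``labels`` are passed.
#
#     batch = tokenizer("Paris is the <mask> of France.", return_tensors="tf")
#     prediction_scores = model(batch["input_ids"])[0]   # tuple path
#     out = model(batch["input_ids"], return_dict=True)  # TFMaskedLMOutput
#     out.loss is None                                   # True without labels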
class TFRobertaClassificationHead(tf.keras.layers.Layer): class TFRobertaClassificationHead(tf.keras.layers.Layer):
...@@ -385,7 +374,12 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -385,7 +374,12 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier") self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -396,30 +390,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -396,30 +390,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
r""" r"""
Return: labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: Labels for computing the sequence classification/regression loss.
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
Classification (or regression if config.num_labels==1) scores (before SoftMax). If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
...@@ -432,19 +418,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -432,19 +418,22 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=training) logits = self.classifier(sequence_output, training=training)
outputs = (logits,) + outputs[2:] loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
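# Sketch (``model`` and ``input_ids`` are placeholders for a fine-tuned classifier and
# its inputs): passing ``labels`` triggers ``compute_loss``, so the loss is prepended to
# the tuple or stored on the dataclass.
#
#     out = model(input_ids, labels=tf.constant([1]), return_dict=True)
#     out.loss, out.logits  # loss tensor, (batch_size, num_labels) logits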
@add_start_docstrings( @add_start_docstrings(
...@@ -472,7 +461,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -472,7 +461,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
...@@ -483,6 +477,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -483,6 +477,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -491,24 +486,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -491,24 +486,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -519,8 +496,9 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -519,8 +496,9 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
labels = inputs[8] if len(inputs) > 8 else labels return_dict = inputs[8] if len(inputs) > 8 else return_dict
assert len(inputs) <= 9, "Too many inputs." labels = inputs[9] if len(inputs) > 9 else labels
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -530,10 +508,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -530,10 +508,12 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 9, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
...@@ -555,19 +535,23 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -555,19 +535,23 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
...@@ -587,7 +571,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -587,7 +571,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFTokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -598,6 +587,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -598,6 +587,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -605,27 +595,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -605,27 +595,12 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[9] if len(inputs) > 9 else labels
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
...@@ -638,6 +613,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -638,6 +613,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
...@@ -646,13 +622,15 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -646,13 +622,15 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + outputs[2:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
)
@add_start_docstrings( @add_start_docstrings(
...@@ -670,7 +648,12 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -670,7 +648,12 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="roberta-base",
output_type=TFQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -681,6 +664,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -681,6 +664,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
training=False, training=False,
...@@ -694,30 +678,13 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -694,30 +678,13 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.roberta.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[9] if len(inputs) > 9 else start_positions
end_positions = inputs[9] if len(inputs) > 9 else end_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions
if len(inputs) > 8: if len(inputs) > 9:
inputs = inputs[:8] inputs = inputs[:9]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", end_positions) end_positions = inputs.pop("end_positions", end_positions)
...@@ -731,6 +698,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -731,6 +698,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
...@@ -741,12 +709,20 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -741,12 +709,20 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + outputs[2:] loss = None
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions} labels = {"start_position": start_positions}
labels["end_position"] = end_positions labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2]) loss = self.compute_loss(labels, (start_logits, end_logits))
outputs = (loss,) + outputs
if not return_dict:
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) output = (start_logits, end_logits) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput(
loss=loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
...@@ -25,7 +25,14 @@ import warnings ...@@ -25,7 +25,14 @@ import warnings
import tensorflow as tf import tensorflow as tf
from .configuration_t5 import T5Config from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_outputs import TFSeq2SeqLMOutput, TFSeq2SeqModelOutput
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss, TFCausalLanguageModelingLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -39,6 +46,7 @@ from .tokenization_utils import BatchEncoding ...@@ -39,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "T5Config"
_TOKENIZER_FOR_DOC = "T5Tokenizer" _TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -575,8 +583,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -575,8 +583,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
head_mask = inputs[5] if len(inputs) > 5 else head_mask head_mask = inputs[5] if len(inputs) > 5 else head_mask
past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states
use_cache = inputs[7] if len(inputs) > 7 else use_cache use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 7 else output_attentions output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs." assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
...@@ -934,6 +942,7 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -934,6 +942,7 @@ class TFT5Model(TFT5PreTrainedModel):
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
...@@ -948,29 +957,11 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -948,29 +957,11 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
If `decoder_past_key_value_states` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
...@@ -996,7 +987,8 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -996,7 +987,8 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs: if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
...@@ -1013,11 +1005,13 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -1013,11 +1005,13 @@ class TFT5Model(TFT5PreTrainedModel):
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 12, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 13, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
...@@ -1063,12 +1057,40 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -1063,12 +1057,40 @@ class TFT5Model(TFT5PreTrainedModel):
], ],
training=training, training=training,
) )
past = (
(encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None
)
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
return decoder_outputs + encoder_outputs
# If put before, this breaks the tf compilation.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
return decoder_outputs + encoder_outputs
# This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch)
# TF refuses to compile anymore.
if not cast_bool_to_primitive(use_cache, self.config.use_cache):
decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:]
if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states):
encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:]
decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:]
if not cast_bool_to_primitive(output_attentions, self.config.output_attentions):
encoder_outputs = encoder_outputs + (None,)
decoder_outputs = decoder_outputs + (None,)
return TFSeq2SeqModelOutput(
last_hidden_state=decoder_outputs[0],
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs[2],
decoder_attentions=decoder_outputs[3],
encoder_last_hidden_state=encoder_outputs[0],
encoder_hidden_states=encoder_outputs[1],
encoder_attentions=encoder_outputs[2],
)
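Because `TFSeq2SeqModelOutput` is filled from fixed tuple positions, the branch above first pads `decoder_outputs` and `encoder_outputs` with `None` placeholders whenever an optional output was not requested. A toy sketch of the trick (hypothetical values, not the transformers source):

# Optional slots are padded with None so fixed indices stay valid.
last_hidden, cache = "h", "c"
decoder_outputs = (last_hidden, cache)  # use_cache on, no optional outputs

output_hidden_states, output_attentions = False, False
if not output_hidden_states:
    decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:]
if not output_attentions:
    decoder_outputs = decoder_outputs + (None,)

assert decoder_outputs == ("h", "c", None, None)
# decoder_outputs[2] is always the hidden-states slot and decoder_outputs[3]
# the attentions slot, whatever combination of flags the caller picked.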
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
...@@ -1115,6 +1137,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1115,6 +1137,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call( def call(
self, self,
inputs, inputs,
...@@ -1129,6 +1152,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1129,6 +1152,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache=None, use_cache=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -1138,24 +1162,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1138,24 +1162,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
decoder_past_key_value_states (:obj:`tuple(tuple(tf.Tensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`, `optional`, returned when ``use_cache=True``):
Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples:: Examples::
...@@ -1186,8 +1192,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1186,8 +1192,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache = inputs[9] if len(inputs) > 9 else use_cache use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
labels = inputs[12] if len(inputs) > 12 else labels return_dict = inputs[12] if len(inputs) > 12 else return_dict
assert len(inputs) <= 13, "Too many inputs." labels = inputs[13] if len(inputs) > 13 else labels
assert len(inputs) <= 14, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs: if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
...@@ -1204,12 +1211,14 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1204,12 +1211,14 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 13, "Too many inputs." assert len(inputs) <= 14, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
...@@ -1261,22 +1270,48 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1261,22 +1270,48 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
training=training, training=training,
) )
# insert decoder past at right place
# to speed up decoding
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
embed_tokens = self.get_output_embeddings() embed_tokens = self.get_output_embeddings()
logits = embed_tokens(sequence_output, mode="linear") logits = embed_tokens(sequence_output, mode="linear")
decoder_outputs = (logits,) + decoder_outputs[1:]
if labels is not None: loss = None if labels is None else self.compute_loss(labels, logits)
loss = self.compute_loss(labels, logits)
decoder_outputs = (loss,) + decoder_outputs
return decoder_outputs + encoder_outputs past = (
(encoder_outputs, decoder_outputs[1]) if cast_bool_to_primitive(use_cache, self.config.use_cache) else None
)
if not return_dict:
if past is not None:
decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:]
output = (logits,) + decoder_outputs[1:] + encoder_outputs
return ((loss,) + output) if loss is not None else output
# Putting this before breaks tf compilation.
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# This is long and annoying but if we introduce return_dict at the TFT5MainLayer level (like in PyTorch)
# TF refuses to compile anymore.
if not cast_bool_to_primitive(use_cache, self.config.use_cache):
decoder_outputs = decoder_outputs[:1] + (None,) + decoder_outputs[1:]
if not cast_bool_to_primitive(output_hidden_states, self.config.output_hidden_states):
encoder_outputs = encoder_outputs[:1] + (None,) + encoder_outputs[1:]
decoder_outputs = decoder_outputs[:2] + (None,) + decoder_outputs[2:]
if not cast_bool_to_primitive(output_attentions, self.config.output_attentions):
encoder_outputs = encoder_outputs + (None,)
decoder_outputs = decoder_outputs + (None,)
return TFSeq2SeqLMOutput(
loss=loss,
logits=logits,
decoder_past_key_values=past,
decoder_hidden_states=decoder_outputs[2],
decoder_attentions=decoder_outputs[3],
encoder_last_hidden_state=encoder_outputs[0],
encoder_hidden_states=encoder_outputs[1],
encoder_attentions=encoder_outputs[2],
)
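As a usage sketch of the two return modes (assuming the public `t5-small` checkpoint and the usual `from_pretrained` API; not part of the diff):

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
enc = tokenizer("translate English to German: Hello.", return_tensors="tf")

# Legacy tuple: the caller must know that logits sit at a fixed position.
outputs = model(enc["input_ids"], decoder_input_ids=enc["input_ids"], return_dict=False)
logits = outputs[0]

# New TFSeq2SeqLMOutput: fields are addressed by name, order-independent.
outputs = model(enc["input_ids"], decoder_input_ids=enc["input_ids"], return_dict=True)
logits = outputs.logits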
def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs): def prepare_inputs_for_generation(self, inputs, past, attention_mask, use_cache, **kwargs):
assert past is not None, "past has to be defined for encoder_outputs" assert past is not None, "past has to be defined for encoder_outputs"
......
...@@ -18,11 +18,13 @@ ...@@ -18,11 +18,13 @@
import logging import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import tensorflow as tf import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
...@@ -30,6 +32,7 @@ from .tokenization_utils import BatchEncoding ...@@ -30,6 +32,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "TransfoXLConfig"
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer" _TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -388,6 +391,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -388,6 +391,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
self.n_token = config.vocab_size self.n_token = config.vocab_size
...@@ -525,6 +529,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -525,6 +529,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -533,8 +538,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -533,8 +538,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_hidden_states = inputs[5] if len(inputs) > 4 else output_hidden_states output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs[6] if len(inputs) > 6 else return_dict
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems) mems = inputs.get("mems", mems)
...@@ -542,12 +548,14 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -542,12 +548,14 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 7, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
...@@ -606,7 +614,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -606,7 +614,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
# word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] # word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
hids = [] hids = []
attentions = [] attentions = [] if output_attentions else None
if self.attn_type == 0: # default if self.attn_type == 0: # default
pos_seq = tf.range(klen - 1, -1, -1.0) pos_seq = tf.range(klen - 1, -1, -1.0)
if self.clamp_len > 0: if self.clamp_len > 0:
...@@ -633,17 +641,24 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -633,17 +641,24 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
new_mems = self._update_mems(hids, mems, mlen, qlen) new_mems = self._update_mems(hids, mems, mlen, qlen)
# We transpose back here to shape [bsz, len, hidden_dim] # We transpose back here to shape [bsz, len, hidden_dim]
outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems] core_out = tf.transpose(core_out, perm=(1, 0, 2))
if output_hidden_states: if output_hidden_states:
# Add last layer and transpose to library standard shape [bsz, len, hidden_dim] # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
hids.append(core_out) hids.append(core_out)
hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids) hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
outputs.append(hids) else:
hids = None
if output_attentions: if output_attentions:
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions) attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs.append(attentions)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions) if not return_dict:
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TFTransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions,
)
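The tuple fallback here takes the complementary strategy to the T5 code above: instead of padding with `None`, absent members are dropped, preserving the variable-length tuples the old API returned. A sketch of the consequence (toy values, not the transformers source):

# Optional members are filtered out entirely in tuple mode.
core_out, new_mems, hids, attentions = "h", ["m0", "m1"], None, ("a0",)
legacy = tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
assert legacy == ("h", ["m0", "m1"], ("a0",))
# legacy[2] is attentions here but would be hidden states when
# output_hidden_states=True -- the positional ambiguity that the
# TFTransfoXLModelOutput dataclass below removes.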
class TFTransfoXLPreTrainedModel(TFPreTrainedModel): class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
...@@ -655,6 +670,70 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): ...@@ -655,6 +670,70 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer" base_model_prefix = "transformer"
@dataclass
class TFTransfoXLModelOutput(ModelOutput):
"""
Base class for model outputs that may also contain past key/value states (to speed up sequential decoding).
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids whose past is given to this model
should not be passed as input ids, as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFTransfoXLLMHeadModelOutput(ModelOutput):
"""
Base class for model outputs that may also contain past key/value states (to speed up sequential decoding).
Args:
losses (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length-1)`, `optional`, returned when ``labels`` is provided):
Language modeling losses (not reduced).
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids whose past is given to this model
should not be passed as input ids, as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
prediction_scores: tf.Tensor = None
mems: List[tf.Tensor] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
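What these `ModelOutput` dataclasses buy is named access on top of a lossless tuple view. A simplified stand-in (deliberately not the transformers implementation) that captures the idea:

from dataclasses import dataclass, fields
from typing import Any, Optional, Tuple

@dataclass
class TinyOutput:
    prediction_scores: Any = None
    mems: Any = None
    hidden_states: Optional[Tuple] = None
    attentions: Optional[Tuple] = None

    def to_tuple(self):
        # Mirror the tuple return path: keep only the fields that were produced.
        return tuple(getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None)

out = TinyOutput(prediction_scores="p", mems=["m"])
assert out.prediction_scores == "p"
assert out.to_tuple() == ("p", ["m"])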
TRANSFO_XL_START_DOCSTRING = r""" TRANSFO_XL_START_DOCSTRING = r"""
.. note:: .. note::
...@@ -706,6 +785,11 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" ...@@ -706,6 +785,11 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -719,29 +803,13 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): ...@@ -719,29 +803,13 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TFTransfoXLModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -797,57 +865,47 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -797,57 +865,47 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz) return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="transfo-xl-wt103",
output_type=TFTransfoXLLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
mems=None, mems=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None,
training=False, training=False,
): ):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
labels = inputs[4] if len(inputs) > 4 else labels output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs." return_dict = inputs[6] if len(inputs) > 6 else return_dict
labels = inputs[7] if len(inputs) > 7 else labels
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (BatchEncoding, dict)): elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems) mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
labels = inputs.get("labels", labels)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 6, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels)
assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
bsz, tgt_len = shape_list(input_ids)[:2] bsz, tgt_len = shape_list(input_ids)[:2]
...@@ -855,17 +913,30 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -855,17 +913,30 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
bsz, tgt_len = shape_list(inputs_embeds)[:2] bsz, tgt_len = shape_list(inputs_embeds)[:2]
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, mems, head_mask, inputs_embeds, output_attentions, output_hidden_states, training=training input_ids,
mems,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict,
training=training,
) )
last_hidden = transformer_outputs[0] last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:] pred_hid = last_hidden[:, -tgt_len:]
outputs = transformer_outputs[1:]
softmax_output = self.crit(pred_hid, labels, training=training) softmax_output = self.crit(pred_hid, labels, training=training)
outputs = [softmax_output] + outputs
return outputs # logits, new_mems, (all hidden states), (all attentions) if not return_dict:
return (softmax_output,) + transformer_outputs[1:]
return TFTransfoXLLMHeadModelOutput(
prediction_scores=softmax_output,
mems=transformer_outputs.mems,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
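Usage sketch for the memory mechanism (assuming the public `transfo-xl-wt103` checkpoint; not part of the diff): the `mems` returned for one segment are fed back so the next segment attends to the cached states.

from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
model = TFTransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")

first = tokenizer("The cat sat", return_tensors="tf")
out = model(first["input_ids"], return_dict=True)

second = tokenizer("on the mat", return_tensors="tf")
# Feeding mems back extends the effective context past this segment.
out = model(second["input_ids"], mems=out.mems, return_dict=True)
scores = out.prediction_scores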
def prepare_inputs_for_generation(self, inputs, past, **model_kwargs): def prepare_inputs_for_generation(self, inputs, past, **model_kwargs):
inputs = {"inputs": inputs} inputs = {"inputs": inputs}
......
...@@ -20,6 +20,8 @@ import itertools ...@@ -20,6 +20,8 @@ import itertools
import logging import logging
import math import math
import warnings import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
...@@ -27,10 +29,18 @@ import tensorflow as tf ...@@ -27,10 +29,18 @@ import tensorflow as tf
from .configuration_xlm import XLMConfig from .configuration_xlm import XLMConfig
from .file_utils import ( from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings, add_code_sample_docstrings,
add_start_docstrings, add_start_docstrings,
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_outputs import (
TFBaseModelOutput,
TFMultipleChoiceModelOutput,
TFQuestionAnsweringModelOutput,
TFSequenceClassifierOutput,
TFTokenClassifierOutput,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -48,6 +58,7 @@ from .tokenization_utils import BatchEncoding ...@@ -48,6 +58,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLMConfig"
_TOKENIZER_FOR_DOC = "XLMTokenizer" _TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
...@@ -224,6 +235,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -224,6 +235,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.return_dict = config.use_return_dict
# encoder / decoder, output layer # encoder / decoder, output layer
self.is_encoder = config.is_encoder self.is_encoder = config.is_encoder
...@@ -340,6 +352,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -340,6 +352,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
training=False, training=False,
): # removed: src_enc=None, src_len=None ): # removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -354,7 +367,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -354,7 +367,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -367,12 +381,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -367,12 +381,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 11, "Too many inputs." return_dict = inputs.get("return_dict", return_dict)
assert len(inputs) <= 12, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -454,8 +470,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -454,8 +470,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
tensor = tensor * mask[..., tf.newaxis] tensor = tensor * mask[..., tf.newaxis]
# transformer layers # transformer layers
hidden_states = () hidden_states = () if output_hidden_states else None
attentions = () attentions = () if output_attentions else None
for i in range(self.n_layers): for i in range(self.n_layers):
if output_hidden_states: if output_hidden_states:
hidden_states = hidden_states + (tensor,) hidden_states = hidden_states + (tensor,)
...@@ -494,12 +510,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer): ...@@ -494,12 +510,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
# move back sequence length to dimension 0 # move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
outputs = (tensor,) if not return_dict:
if output_hidden_states: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
outputs = outputs + (hidden_states,) return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
if output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
class TFXLMPreTrainedModel(TFPreTrainedModel): class TFXLMPreTrainedModel(TFPreTrainedModel):
...@@ -522,6 +535,33 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): ...@@ -522,6 +535,33 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list}
# Remove when XLMWithLMHead computes loss like other LM models
@dataclass
class TFXLMWithLMHeadModelOutput(ModelOutput):
"""
Base class for :class:`~transformers.TFXLMWithLMHeadModel` outputs.
Args:
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
logits: tf.Tensor = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
XLM_START_DOCSTRING = r""" XLM_START_DOCSTRING = r"""
.. note:: .. note::
...@@ -603,6 +643,11 @@ XLM_INPUTS_DOCSTRING = r""" ...@@ -603,6 +643,11 @@ XLM_INPUTS_DOCSTRING = r"""
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
...@@ -616,25 +661,13 @@ class TFXLMModel(TFXLMPreTrainedModel): ...@@ -616,25 +661,13 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFBaseModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -701,32 +734,26 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): ...@@ -701,32 +734,26 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs} return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFXLMWithLMHeadModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" return_dict = kwargs.get("return_dict")
Return: return_dict = return_dict if return_dict is not None else self.transformer.return_dict
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output) outputs = self.pred_layer(output)
outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs if not return_dict:
return (outputs,) + transformer_outputs[1:]
return TFXLMWithLMHeadModelOutput(
logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
)
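Because this `call` forwards `**kwargs` wholesale to the main layer, `return_dict` is peeked with `.get` rather than popped, so the flag still reaches `TFXLMMainLayer` and both resolve the same default. A minimal sketch of that resolution (hypothetical function, not the transformers source):

def call(inputs, default_return_dict=True, **kwargs):
    # Peek, don't pop: kwargs (still containing return_dict) are forwarded on.
    return_dict = kwargs.get("return_dict")
    return_dict = return_dict if return_dict is not None else default_return_dict
    return return_dict

assert call("x") is True
assert call("x", return_dict=False) is False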
@add_start_docstrings( @add_start_docstrings(
...@@ -743,7 +770,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -743,7 +770,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -757,6 +789,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -757,6 +789,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -766,27 +799,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -766,27 +799,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
...@@ -802,19 +820,25 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -802,19 +820,25 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states, output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here loss = None if labels is None else self.compute_loss(labels, logits)
if labels is not None: if not return_dict:
loss = self.compute_loss(labels, logits) output = (logits,) + transformer_outputs[1:]
outputs = (loss,) + outputs return ((loss,) + output) if loss is not None else output
return outputs # (loss), logits, (hidden_states), (attentions) return TFSequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
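The head classes all converge on the same return pattern: compute `loss` eagerly as None-or-tensor, then prepend it only in tuple mode. Reduced to a sketch (hypothetical helper, not the transformers source):

def assemble(logits, extras, loss=None, return_dict=False):
    if not return_dict:
        output = (logits,) + extras
        return ((loss,) + output) if loss is not None else output
    return {"loss": loss, "logits": logits, "extras": extras}

assert assemble("lg", ("hs",)) == ("lg", "hs")
assert assemble("lg", ("hs",), loss=0.3) == (0.3, "lg", "hs")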
@add_start_docstrings( @add_start_docstrings(
...@@ -845,7 +869,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -845,7 +869,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
} }
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @add_code_sample_docstrings(
tokenizer_class=_TOKENIZER_FOR_DOC,
checkpoint="xlm-mlm-en-2048",
output_type=TFMultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def call( def call(
self, self,
inputs, inputs,
...@@ -859,6 +888,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -859,6 +888,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
return_dict=None,
labels=None, labels=None,
training=False, training=False,
): ):
...@@ -867,24 +897,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -867,24 +897,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -898,8 +910,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -898,8 +910,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels return_dict = inputs[11] if len(inputs) > 11 else return_dict
assert len(inputs) <= 12, "Too many inputs." labels = inputs[12] if len(inputs) > 12 else labels
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -912,10 +925,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -912,10 +925,12 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels) labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs." assert len(inputs) <= 13, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None: if input_ids is not None:
num_choices = shape_list(input_ids)[1] num_choices = shape_list(input_ids)[1]
...@@ -955,19 +970,26 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -955,19 +970,26 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states, output_hidden_states,
return_dict=return_dict,
training=training, training=training,
) )
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
logits = self.logits_proj(logits) logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices)) reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None: loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs if not return_dict:
output = (reshaped_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return outputs # (loss), reshaped_logits, (hidden_states), (attentions) return TFMultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
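For reference, a minimal sketch of how the two return modes above behave; the checkpoint name and dummy shapes are illustrative, not taken from this diff:

import tensorflow as tf
from transformers import TFXLMForMultipleChoice

model = TFXLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")  # illustrative checkpoint
input_ids = tf.ones((1, 2, 8), dtype=tf.int32)  # (batch, num_choices, seq_len)

logits = model(input_ids, return_dict=False)[0]  # legacy tuple: index positionally
outputs = model(input_ids, return_dict=True)     # TFMultipleChoiceModelOutput
assert logits.shape == outputs.logits.shape == (1, 2)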
@add_start_docstrings(
@@ -987,7 +1009,12 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlm-mlm-en-2048",
+ output_type=TFTokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(
self,
inputs=None,
@@ -1001,6 +1028,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
@@ -1008,27 +1036,12 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1044,6 +1057,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
@@ -1052,13 +1066,18 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
- outputs = (logits,) + transformer_outputs[1:]  # add hidden states and attention if they are here
- if labels is not None:
- loss = self.compute_loss(labels, logits)
- outputs = (loss,) + outputs
- return outputs  # (loss), logits, (hidden_states), (attentions)
+ loss = None if labels is None else self.compute_loss(labels, logits)
+ if not return_dict:
+ output = (logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFTokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
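For reference, a short sketch of the token-classification loss that compute_loss applies to labels in ``[0, ..., config.num_labels - 1]``; an illustrative reimplementation with plain Keras ops, not the library's exact helper (which additionally masks ignored positions):

import tensorflow as tf

num_labels = 5
logits = tf.random.normal((2, 8, num_labels))  # (batch, seq_len, num_labels)
labels = tf.random.uniform((2, 8), 0, num_labels, dtype=tf.int32)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
per_token_loss = loss_fn(labels, logits)  # (batch, seq_len)
print(float(tf.reduce_mean(per_token_loss)))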
@add_start_docstrings(
@@ -1075,7 +1094,12 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlm-mlm-en-2048",
+ output_type=TFQuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(
self,
inputs=None,
@@ -1089,6 +1113,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
start_positions=None,
end_positions=None,
training=False,
@@ -1102,30 +1127,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions start_positions = inputs[12] if len(inputs) > 12 else start_positions
end_positions = inputs[12] if len(inputs) > 12 else end_positions end_positions = inputs[13] if len(inputs) > 13 else end_positions
if len(inputs) > 11: if len(inputs) > 12:
inputs = inputs[:11] inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
start_positions = inputs.pop("start_positions", start_positions) start_positions = inputs.pop("start_positions", start_positions)
end_positions = inputs.pop("end_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions)
@@ -1142,6 +1150,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
@@ -1152,14 +1161,20 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
- outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+ loss = None
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
- loss = self.compute_loss(labels, outputs[:2])
- outputs = (loss,) + outputs
- return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+ loss = self.compute_loss(labels, (start_logits, end_logits))
+ if not return_dict:
+ output = (start_logits, end_logits) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFQuestionAnsweringModelOutput(
+ loss=loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
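For reference, a sketch of the span loss described in the docstring, with out-of-range positions clamped before the two cross-entropies are combined (illustrative; the library's TFQuestionAnsweringLoss averages the start and end terms):

import tensorflow as tf

seq_len = 16
start_logits = tf.random.normal((2, seq_len))
end_logits = tf.random.normal((2, seq_len))
start_positions = tf.clip_by_value(tf.constant([3, 20]), 0, seq_len - 1)  # 20 gets clamped
end_positions = tf.clip_by_value(tf.constant([5, 25]), 0, seq_len - 1)

ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = (ce(start_positions, start_logits) + ce(end_positions, end_logits)) / 2.0
print(float(loss))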
@@ -62,8 +62,6 @@ XLM_ROBERTA_START_DOCSTRING = r"""
config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
""" """
@@ -18,6 +18,8 @@
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import numpy as np
import tensorflow as tf
@@ -25,9 +27,11 @@ import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
replace_return_docstrings,
)
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
@@ -47,6 +51,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_CONFIG_FOR_DOC = "XLNetConfig"
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
@@ -436,6 +441,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.return_dict = config.return_dict
self.mem_len = config.mem_len
self.reuse_len = config.reuse_len
@@ -586,6 +592,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
@@ -601,7 +608,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
- assert len(inputs) <= 12, "Too many inputs."
+ return_dict = inputs[12] if len(inputs) > 12 else return_dict
+ assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
@@ -615,12 +623,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
- assert len(inputs) <= 12, "Too many inputs."
+ return_dict = inputs.get("return_dict", return_dict)
+ assert len(inputs) <= 13, "Too many inputs."
else:
input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
return_dict = return_dict if return_dict is not None else self.return_dict
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
@@ -743,8 +753,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
if mems is None:
mems = [None] * len(self.layer)
- attentions = []
- hidden_states = []
+ attentions = [] if output_attentions else None
+ hidden_states = [] if output_hidden_states else None
for i, layer_module in enumerate(self.layer):
# cache new mems
if self.mem_len is not None and self.mem_len > 0 and use_cache:
@@ -776,22 +786,24 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
output = self.dropout(output_g if output_g is not None else output_h, training=training)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
- outputs = (tf.transpose(output, perm=(1, 0, 2)),)
- if self.mem_len is not None and self.mem_len > 0 and use_cache:
- outputs = outputs + (new_mems,)
+ output = tf.transpose(output, perm=(1, 0, 2))
+ if not (self.mem_len is not None and self.mem_len > 0 and use_cache):
+ new_mems = None
if output_hidden_states:
if output_g is not None:
hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
else:
hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
- outputs = outputs + (hidden_states,)
if output_attentions:
attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
- outputs = outputs + (attentions,)
- return outputs  # outputs, (new_mems), (hidden_states), (attentions)
+ if not return_dict:
+ return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)
+ return TFXLNetModelOutput(
+ last_hidden_state=output, mems=new_mems, hidden_states=hidden_states, attentions=attentions
+ )
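The None-filtered tuple above is why positional indexing was fragile: the index of a given tensor shifts with the flags. A toy illustration, not library code:

def legacy_tuple(output, new_mems=None, hidden_states=None, attentions=None):
    # mirrors the `tuple(v for v in [...] if v is not None)` pattern above
    return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None)

with_mems = legacy_tuple("out", new_mems="mems", attentions="attn")
without_mems = legacy_tuple("out", attentions="attn")
assert with_mems.index("attn") == 2 and without_mems.index("attn") == 1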
class TFXLNetPreTrainedModel(TFPreTrainedModel):
@@ -803,6 +815,218 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel):
base_model_prefix = "transformer"
@dataclass
class TFXLNetModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetModel`.
Args:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
last_hidden_state: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
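A short sketch of the ``num_predict`` convention documented above: with a one-hot ``target_mapping`` the model only produces hidden states for the selected positions (shapes illustrative):

import tensorflow as tf

batch_size, seq_len, num_predict = 1, 8, 1
target_mapping = tf.zeros((batch_size, num_predict, seq_len))
target_mapping = tf.tensor_scatter_nd_update(target_mapping, [[0, 0, seq_len - 1]], [1.0])
# last_hidden_state then has shape (batch_size, num_predict, hidden_size)
# instead of (batch_size, seq_len, hidden_size).
print(target_mapping.shape)  # (1, 1, 8)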
@dataclass
class TFXLNetLMHeadModelOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetLMHeadModel`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Language modeling loss (for next-token prediction).
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_predict, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
``num_predict`` corresponds to ``target_mapping.shape[1]``. If ``target_mapping`` is ``None``, then
``num_predict`` corresponds to ``sequence_length``.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForSequenceClassificationOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForSequenceClassification`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForTokenClassificationOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForTokenClassification`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForMultipleChoiceOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForMultipleChoice`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss.
logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
@dataclass
class TFXLNetForQuestionAnsweringSimpleOutput(ModelOutput):
"""
Output type of :class:`~transformers.TFXLNetForQuestionAnsweringSimple`.
Args:
loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states.
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[tf.Tensor] = None
start_logits: tf.Tensor = None
end_logits: tf.Tensor = None
mems: Optional[List[tf.Tensor]] = None
hidden_states: Optional[Tuple[tf.Tensor]] = None
attentions: Optional[Tuple[tf.Tensor]] = None
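These dataclasses keep backward compatibility: integer indexing still works next to named attributes. A minimal sketch (the exact semantics live in file_utils.ModelOutput):

import tensorflow as tf

outputs = TFXLNetModelOutput(last_hidden_state=tf.ones((1, 4, 8)))
assert outputs[0] is outputs.last_hidden_state  # positional access, old style
assert outputs.mems is None                     # unset optional fields read as None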
XLNET_START_DOCSTRING = r"""
.. note::
@@ -885,6 +1109,11 @@ XLNET_INPUTS_DOCSTRING = r"""
If `use_cache` is True, `mems` are returned and can be used to speed up decoding (see `mems`). Defaults to `True`.
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
""" """
@@ -898,29 +1127,13 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlnet-base-cased",
+ output_type=TFXLNetModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(self, inputs, **kwargs):
r"""
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model.
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
@@ -980,6 +1193,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
return inputs
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFXLNetLMHeadModelOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
inputs,
@@ -994,6 +1208,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
@@ -1003,24 +1218,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
@@ -1045,10 +1242,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)):
- labels = inputs[12] if len(inputs) > 12 else labels
- if len(inputs) > 12:
- inputs = inputs[:12]
+ labels = inputs[13] if len(inputs) > 13 else labels
+ if len(inputs) > 13:
+ inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
@@ -1065,21 +1263,30 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=return_dict,
training=training,
)
hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state, training=training)
- outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
+ loss = None
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
- outputs = (loss,) + outputs
- return outputs  # return logits, (mems), (hidden states), (attentions)
+ if not return_dict:
+ output = (logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFXLNetLMHeadModelOutput(
+ loss=loss,
+ logits=logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
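The label shift above on a toy batch, to make the off-by-one explicit: the logits at position i score the token at position i + 1 (shapes illustrative):

import tensorflow as tf

logits = tf.random.normal((1, 4, 10))  # scores for positions 0..3
labels = tf.constant([[1, 2, 3, 4]])   # tokens at positions 0..3

shifted_logits = logits[:, :-1]  # positions 0..2 predict the next token
shifted_labels = labels[:, 1:]   # tokens 1..3 are the targets
print(shifted_logits.shape, shifted_labels.shape)  # (1, 3, 10) (1, 3)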
@add_start_docstrings(
@@ -1101,7 +1308,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlnet-base-cased",
+ output_type=TFXLNetForSequenceClassificationOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(
self,
inputs=None,
@@ -1116,6 +1328,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
@@ -1125,31 +1338,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[13] if len(inputs) > 13 else labels
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1166,19 +1360,26 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
output = transformer_outputs[0]
output = self.sequence_summary(output)
logits = self.logits_proj(output)
- outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
- if labels is not None:
- loss = self.compute_loss(labels, logits)
- outputs = (loss,) + outputs
- return outputs  # (loss), logits, (hidden_states), (attentions)
+ loss = None if labels is None else self.compute_loss(labels, logits)
+ if not return_dict:
+ output = (logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFXLNetForSequenceClassificationOutput(
+ loss=loss,
+ logits=logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
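A sketch of the ``num_labels`` switch the docstring describes, regression vs. classification (illustrative; mirrors what TFSequenceClassificationLoss does):

import tensorflow as tf

def sequence_classification_loss(labels, logits):
    if logits.shape[-1] == 1:  # num_labels == 1 -> regression (mean-squared error)
        return tf.keras.losses.MSE(labels, tf.squeeze(logits, axis=-1))
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

print(float(sequence_classification_loss(tf.constant([0.5]), tf.constant([[0.7]]))))
print(float(sequence_classification_loss(tf.constant([1]), tf.constant([[0.1, 0.9]]))))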
@add_start_docstrings(
@@ -1208,7 +1409,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlnet-base-cased",
+ output_type=TFXLNetForMultipleChoiceOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(
self,
inputs=None,
@@ -1223,6 +1429,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
@@ -1231,24 +1438,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
@@ -1263,8 +1452,9 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
- labels = inputs[12] if len(inputs) > 12 else labels
- assert len(inputs) <= 13, "Too many inputs."
+ return_dict = inputs[12] if len(inputs) > 12 else return_dict
+ labels = inputs[13] if len(inputs) > 13 else labels
+ assert len(inputs) <= 14, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
@@ -1278,10 +1468,12 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
return_dict = inputs.get("return_dict", return_dict)
labels = inputs.get("labels", labels)
- assert len(inputs) <= 13, "Too many inputs."
+ assert len(inputs) <= 14, "Too many inputs."
else:
input_ids = inputs
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
@@ -1312,19 +1504,26 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
use_cache,
output_attentions,
output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
- outputs = (reshaped_logits,) + transformer_outputs[1:]  # add hidden states and attention if they are here
- if labels is not None:
- loss = self.compute_loss(labels, reshaped_logits)
- outputs = (loss,) + outputs
- return outputs  # (loss), logits, (mems), (hidden states), (attentions)
+ loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
+ if not return_dict:
+ output = (reshaped_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFXLNetForMultipleChoiceOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
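The flattening the multiple-choice forward pass performs before the transformer call above, spelled out on dummy shapes (illustrative):

import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = tf.zeros((batch_size, num_choices, seq_len), dtype=tf.int32)
flat_input_ids = tf.reshape(input_ids, (-1, seq_len))               # (8, 16)
per_choice_score = tf.random.normal((batch_size * num_choices, 1))  # logits_proj output
reshaped_logits = tf.reshape(per_choice_score, (-1, num_choices))   # (2, 4)
print(flat_input_ids.shape, reshaped_logits.shape)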
@add_start_docstrings(
@@ -1343,7 +1542,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
- @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
+ @add_code_sample_docstrings(
+ tokenizer_class=_TOKENIZER_FOR_DOC,
+ checkpoint="xlnet-base-cased",
+ output_type=TFXLNetForTokenClassificationOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
def call(
self,
inputs=None,
@@ -1358,6 +1562,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
use_cache=True,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
training=False,
):
@@ -1365,31 +1570,12 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:(batch_size, config.num_labels)`):
Classification scores (before SoftMax).
mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
""" """
return_dict = return_dict if return_dict is not None else self.transformer.return_dict
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[13] if len(inputs) > 13 else labels
if len(inputs) > 12: if len(inputs) > 13:
inputs = inputs[:12] inputs = inputs[:13]
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels) labels = inputs.pop("labels", labels)
@@ -1406,19 +1592,25 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
output = transformer_outputs[0]
logits = self.classifier(output)
- outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it
- if labels is not None:
- loss = self.compute_loss(labels, logits)
- outputs = (loss,) + outputs
- return outputs  # (loss), logits, (hidden_states), (attentions)
+ loss = None if labels is None else self.compute_loss(labels, logits)
+ if not return_dict:
+ output = (logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ return TFXLNetForTokenClassificationOutput(
+ loss=loss,
+ logits=logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
@add_start_docstrings(
@@ -1435,7 +1627,12 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
)
    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xlnet-base-cased",
+        output_type=TFXLNetForQuestionAnsweringSimpleOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs=None,
@@ -1450,6 +1647,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
        use_cache=True,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        start_positions=None,
        end_positions=None,
        training=False,
@@ -1463,36 +1661,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
-        Returns:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
-            loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-                Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
-            start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-                Span-start scores (before SoftMax).
-            end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
-                Span-end scores (before SoftMax).
-            mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`):
-                Contains pre-computed hidden-states (key and values in the attention blocks).
-                Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
-                should not be passed as input ids as they have already been computed.
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if isinstance(inputs, (tuple, list)):
-            start_positions = inputs[12] if len(inputs) > 12 else start_positions
-            end_positions = inputs[13] if len(inputs) > 13 else end_positions
-            if len(inputs) > 12:
-                inputs = inputs[:12]
+            start_positions = inputs[13] if len(inputs) > 13 else start_positions
+            end_positions = inputs[14] if len(inputs) > 14 else end_positions
+            if len(inputs) > 13:
+                inputs = inputs[:13]
        elif isinstance(inputs, (dict, BatchEncoding)):
            start_positions = inputs.pop("start_positions", start_positions)
            end_positions = inputs.pop("end_positions", end_positions)
@@ -1510,6 +1685,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
            training=training,
        )
@@ -1520,17 +1696,24 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
-        outputs = (start_logits, end_logits,) + transformer_outputs[
-            1:
-        ]  # Keep mems, hidden states, attentions if there are in it
+        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
-            loss = self.compute_loss(labels, outputs[:2])
-            outputs = (loss,) + outputs
-        return outputs  # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
+            loss = self.compute_loss(labels, (start_logits, end_logits))
+        if not return_dict:
+            output = (start_logits, end_logits) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return TFXLNetForQuestionAnsweringSimpleOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            mems=transformer_outputs.mems,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
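The same pattern applied to span extraction. A sketch (hedged: `xlnet-base-cased` carries no trained QA head, so the decoded span is illustrative only):

```python
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = TFXLNetForQuestionAnsweringSimple.from_pretrained("xlnet-base-cased", return_dict=True)

inputs = tokenizer("Who wrote it?", "XLNet was written by Yang et al.", return_tensors="tf")
outputs = model(inputs)

# Greedy span decoding from the named logits:
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
answer_ids = inputs["input_ids"][0].numpy()[start : end + 1]
print(tokenizer.decode(answer_ids))
```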
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
......
@@ -711,7 +711,7 @@ class XLNetForTokenClassificationOutput(ModelOutput):
@dataclass
class XLNetForMultipleChoiceOutput(ModelOutput):
    """
-    Base class for outputs of multiple choice models.
+    Output type of :class:`~transformers.XLNetForMultipleChoice`.
    Args:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -747,7 +747,7 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
@dataclass
class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
    """
-    Base class for outputs of question answering models.
+    Output type of :class:`~transformers.XLNetForQuestionAnsweringSimple`.
    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -784,7 +784,7 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
@dataclass
class XLNetForQuestionAnsweringOutput(ModelOutput):
    """
-    Base class for outputs of question answering models using a :obj:`SquadHead`.
+    Output type of :class:`~transformers.XLNetForQuestionAnswering`.
    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
@@ -1227,7 +1227,6 @@ class XLNetModel(XLNetPreTrainedModel):
        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
        output = output.permute(1, 0, 2).contiguous()
-        # TODO Teven: fix this test to only use use_cache.
        if not use_cache:
            new_mems = None
......
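The docstring renames above all point at the same `ModelOutput` contract: attribute access and string-key access agree, and `None` fields are dropped when the object is converted to a tuple. A small sketch of that contract (the `ToyOutput` class is hypothetical, defined only for illustration, and assumes the `ModelOutput` helper behaves as in this refactor):

```python
from dataclasses import dataclass
from typing import Optional

import tensorflow as tf
from transformers.file_utils import ModelOutput

@dataclass
class ToyOutput(ModelOutput):  # hypothetical output type, mirroring the real dataclasses
    loss: Optional[tf.Tensor] = None
    logits: tf.Tensor = None

out = ToyOutput(logits=tf.constant([[0.1, 0.9]]))
assert out.logits is out["logits"]          # attribute and key access agree
t = out.to_tuple()
assert len(t) == 1 and t[0] is out.logits   # the None loss is skipped in the tuple
```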
@@ -31,6 +31,14 @@ from .file_utils import (
    add_start_docstrings,
    add_start_docstrings_to_callable,
)
+from .modeling_tf_outputs import (
+    TFBaseModelOutputWithPooling,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
from .modeling_tf_utils import (
    TFMaskedLanguageModelingLoss,
    TFMultipleChoiceLoss,
@@ -46,6 +54,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
+_CONFIG_FOR_DOC = "XXXConfig"
_TOKENIZER_FOR_DOC = "XxxTokenizer"
####################################################
@@ -117,35 +126,60 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
    def call(
-        self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        training=False,
    ):
+        # We allow three types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
+        # The last two options are useful to use the tf.keras fit() method.
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            assert len(inputs) <= 5, "Too many inputs."
-        elif isinstance(inputs, dict):
+            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
+            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
+            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
+            return_dict = inputs[8] if len(inputs) > 8 else return_dict
+            assert len(inputs) <= 9, "Too many inputs."
+        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)
            position_ids = inputs.get("position_ids", position_ids)
            head_mask = inputs.get("head_mask", head_mask)
-            assert len(inputs) <= 5, "Too many inputs."
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+            output_attentions = inputs.get("output_attentions", output_attentions)
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            return_dict = inputs.get("return_dict", return_dict)
+            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs
+        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
+        return_dict = return_dict if return_dict is not None else self.return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
        if attention_mask is None:
-            attention_mask = tf.fill(shape_list(input_ids), 1)
+            attention_mask = tf.fill(input_shape, 1)
        if token_type_ids is None:
-            token_type_ids = tf.fill(shape_list(input_ids), 0)
+            token_type_ids = tf.fill(input_shape, 0)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
@@ -174,14 +208,29 @@ class TFXxxMainLayer(tf.keras.layers.Layer):
            head_mask = [None] * self.num_hidden_layers
            # head_mask = tf.constant([0] * self.num_hidden_layers)
-        ##################################
-        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
+        embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
+        encoder_outputs = self.encoder(
+            embedding_output,
+            extended_attention_mask,
+            head_mask,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            training=training,
+        )
        sequence_output = encoder_outputs[0]
-        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, (hidden_states), (attentions)
+        pooled_output = self.pooler(sequence_output)
+        if not return_dict:
+            return (sequence_output, pooled_output,) + encoder_outputs[1:]
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
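The unpacking block at the top of `call` is what keeps the three calling conventions interchangeable, which `tf.keras.Model.fit()` relies on. A sketch using a small `TFBertModel` as a concrete stand-in for a model built from this template (sizes are hypothetical):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertModel

config = BertConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, return_dict=True)
model = TFBertModel(config)  # randomly initialized stand-in
input_ids = tf.constant([[101, 7592, 102]])
attention_mask = tf.constant([[1, 1, 1]])

out1 = model(input_ids, attention_mask=attention_mask)                    # keyword arguments
out2 = model({"input_ids": input_ids, "attention_mask": attention_mask})  # dict (what fit() passes)
out3 = model([input_ids, attention_mask])                                 # ordered list/tuple
```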
####################################################
@@ -274,6 +323,11 @@ XXX_INPUTS_DOCSTRING = r"""
            (if set to :obj:`False`) for evaluation.
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
            If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
+        return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
+            If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
+            plain tuple.
"""
@@ -287,32 +341,13 @@ class TFXxxModel(TFXxxPreTrainedModel):
        self.transformer = TFXxxMainLayer(config, name="transformer")
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFBaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification)
objective during XXX pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
@@ -329,7 +364,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm")
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs=None,
@@ -340,6 +380,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        labels=None,
        training=False,
    ):
@@ -349,27 +390,12 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
-            prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
-                Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if isinstance(inputs, (tuple, list)):
-            labels = inputs[8] if len(inputs) > 8 else labels
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            labels = inputs[9] if len(inputs) > 9 else labels
+            if len(inputs) > 9:
+                inputs = inputs[:9]
        elif isinstance(inputs, (dict, BatchEncoding)):
            labels = inputs.pop("labels", labels)
@@ -382,19 +408,22 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss):
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
            training=training,
        )
        sequence_output = outputs[0]
        prediction_scores = self.mlm(sequence_output, training=training)
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-        if labels is not None:
-            loss = self.compute_loss(labels, prediction_scores)
-            outputs = (loss,) + outputs
-        return outputs  # (loss), prediction_scores, (hidden_states), (attentions)
+        loss = None if labels is None else self.compute_loss(labels, prediction_scores)
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TFMaskedLMOutput(
+            loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )
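As the labels docstring says, only positions labeled in ``[0, vocab_size)`` contribute to the loss; ``-100`` opts a position out. A sketch with a tiny randomly initialized stand-in model (all sizes and ids hypothetical):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertForMaskedLM

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=2, intermediate_size=64, return_dict=True)
model = TFBertForMaskedLM(config)

input_ids = tf.constant([[2, 54, 3, 99, 4]])
labels = tf.constant([[-100, -100, -100, 7, -100]])  # loss computed only at the fourth position

outputs = model(input_ids, labels=labels)
loss, logits = outputs.loss, outputs.logits
```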
@add_start_docstrings(
@@ -414,7 +443,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
)
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs=None,
@@ -425,6 +459,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        labels=None,
        training=False,
    ):
@@ -434,27 +469,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
-            logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
-                Classification (or regression if config.num_labels==1) scores (before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if isinstance(inputs, (tuple, list)):
-            labels = inputs[8] if len(inputs) > 8 else labels
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            labels = inputs[9] if len(inputs) > 9 else labels
+            if len(inputs) > 9:
+                inputs = inputs[:9]
        elif isinstance(inputs, (dict, BatchEncoding)):
            labels = inputs.pop("labels", labels)
@@ -467,6 +487,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
            training=training,
        )
@@ -475,13 +496,15 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss):
        pooled_output = self.dropout(pooled_output, training=training)
        logits = self.classifier(pooled_output)
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss = self.compute_loss(labels, logits)
-            outputs = (loss,) + outputs
-        return outputs  # (loss), logits, (hidden_states), (attentions)
+        loss = None if labels is None else self.compute_loss(labels, logits)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TFSequenceClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )
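The `num_labels` switch in the docstring is worth spelling out: with `num_labels > 1` the labels are class indices, with `num_labels == 1` they are float regression targets scored with MSE. A sketch of the classification case (tiny hypothetical config):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertForSequenceClassification

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, num_labels=3, return_dict=True)
model = TFBertForSequenceClassification(config)

input_ids = tf.constant([[2, 54, 3], [5, 6, 7]])
labels = tf.constant([2, 0])               # class indices, shape (batch_size,)

outputs = model(input_ids, labels=labels)
print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_labels)
```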
@add_start_docstrings(
@@ -509,7 +532,12 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
        return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs,
@@ -520,6 +548,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        labels=None,
        training=False,
    ):
@@ -527,24 +556,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
-            classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
-                `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
-                Classification scores (before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
        if isinstance(inputs, (tuple, list)):
@@ -556,8 +568,9 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
            inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
            output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
            output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
-            labels = inputs[8] if len(inputs) > 8 else labels
-            assert len(inputs) <= 9, "Too many inputs."
+            return_dict = inputs[8] if len(inputs) > 8 else return_dict
+            labels = inputs[9] if len(inputs) > 9 else labels
+            assert len(inputs) <= 10, "Too many inputs."
        elif isinstance(inputs, (dict, BatchEncoding)):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
@@ -567,10 +580,12 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
            output_attentions = inputs.get("output_attentions", output_attentions)
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            return_dict = inputs.get("return_dict", return_dict)
            labels = inputs.get("labels", labels)
-            assert len(inputs) <= 9, "Too many inputs."
+            assert len(inputs) <= 10, "Too many inputs."
        else:
            input_ids = inputs
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]
@@ -598,6 +613,7 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
            flat_inputs_embeds,
            output_attentions,
            output_hidden_states,
+            return_dict,
        ]
        outputs = self.transformer(flat_inputs, training=training)
@@ -608,13 +624,15 @@ class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss):
        logits = self.classifier(pooled_output)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))
-        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss = self.compute_loss(labels, reshaped_logits)
-            outputs = (loss,) + outputs
-        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+        loss = None if labels is None else self.compute_loss(labels, reshaped_logits)
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TFMultipleChoiceModelOutput(
+            loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )
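The head works by folding the choice dimension into the batch, encoding once, and unfolding the per-sequence scores; the reshaping itself needs nothing model-specific. A pure-TF sketch of that shape dance (the random tensors stand in for real inputs and for the classifier output):

```python
import tensorflow as tf

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = tf.random.uniform((batch_size, num_choices, seq_len), maxval=100, dtype=tf.int32)

flat_ids = tf.reshape(input_ids, (-1, seq_len))  # (batch * num_choices, seq_len) for the encoder
per_seq_scores = tf.random.normal((batch_size * num_choices, 1))  # stand-in for the classifier output
reshaped_logits = tf.reshape(per_seq_scores, (-1, num_choices))   # (batch, num_choices), ready for the loss
```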
@add_start_docstrings(
@@ -634,7 +652,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
)
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs=None,
@@ -645,6 +668,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        labels=None,
        training=False,
    ):
@@ -652,27 +676,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
-            scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
-                Classification scores (before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if isinstance(inputs, (tuple, list)):
-            labels = inputs[8] if len(inputs) > 8 else labels
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            labels = inputs[9] if len(inputs) > 9 else labels
+            if len(inputs) > 9:
+                inputs = inputs[:9]
        elif isinstance(inputs, (dict, BatchEncoding)):
            labels = inputs.pop("labels", labels)
@@ -685,6 +694,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
            training=training,
        )
@@ -693,13 +703,15 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss):
        sequence_output = self.dropout(sequence_output, training=training)
        logits = self.classifier(sequence_output)
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-        if labels is not None:
-            loss = self.compute_loss(labels, logits)
-            outputs = (loss,) + outputs
-        return outputs  # (loss), logits, (hidden_states), (attentions)
+        loss = None if labels is None else self.compute_loss(labels, logits)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TFTokenClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions,
+        )
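Reading per-token predictions back out usually means masking away padding. A sketch (tiny hypothetical config again):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertForTokenClassification

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, num_labels=5, return_dict=True)
model = TFBertForTokenClassification(config)

input_ids = tf.constant([[2, 54, 3, 0, 0]])
attention_mask = tf.constant([[1, 1, 1, 0, 0]])

outputs = model(input_ids, attention_mask=attention_mask)
predictions = tf.argmax(outputs.logits, axis=-1)  # (batch, seq_len)
active_predictions = tf.boolean_mask(predictions, tf.cast(attention_mask, tf.bool))  # drop padding
```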
@add_start_docstrings(
@@ -718,7 +730,12 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
)
    @add_start_docstrings_to_callable(XXX_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xxx-base-cased")
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="xxx-base-cased",
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
    def call(
        self,
        inputs=None,
@@ -729,6 +746,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
+        return_dict=None,
        start_positions=None,
        end_positions=None,
        training=False,
@@ -742,30 +760,13 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
-        Return:
-            :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XxxConfig`) and inputs:
-            start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-                Span-start scores (before SoftMax).
-            end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
-                Span-end scores (before SoftMax).
-            hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-                tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
-                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-            attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-                tuple of :obj:`tf.Tensor` (one for each layer) of shape
-                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
-                Attentions weights after the attention softmax, used to compute the weighted average in the
-                self-attention heads.
        """
+        return_dict = return_dict if return_dict is not None else self.transformer.return_dict
        if isinstance(inputs, (tuple, list)):
-            start_positions = inputs[8] if len(inputs) > 8 else start_positions
-            end_positions = inputs[9] if len(inputs) > 9 else end_positions
-            if len(inputs) > 8:
-                inputs = inputs[:8]
+            start_positions = inputs[9] if len(inputs) > 9 else start_positions
+            end_positions = inputs[10] if len(inputs) > 10 else end_positions
+            if len(inputs) > 9:
+                inputs = inputs[:9]
        elif isinstance(inputs, (dict, BatchEncoding)):
            start_positions = inputs.pop("start_positions", start_positions)
            end_positions = inputs.pop("end_positions", end_positions)
@@ -779,6 +780,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
            training=training,
        )
@@ -789,12 +791,20 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss):
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
-        outputs = (start_logits, end_logits,) + outputs[2:]
+        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
-            loss = self.compute_loss(labels, outputs[:2])
-            outputs = (loss,) + outputs
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+            loss = self.compute_loss(labels, (start_logits, end_logits))
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
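The loss branch packs both positions into one labels dict because `TFQuestionAnsweringLoss.compute_loss` scores start and end jointly. A training-style sketch (tiny hypothetical config and token indices):

```python
import tensorflow as tf
from transformers import BertConfig, TFBertForQuestionAnswering

config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2, num_attention_heads=2,
                    intermediate_size=64, return_dict=True)
model = TFBertForQuestionAnswering(config)

input_ids = tf.constant([[2, 54, 3, 17, 9, 4]])
start_positions = tf.constant([3])  # token index where the answer span starts
end_positions = tf.constant([4])    # token index where it ends

outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
total_loss = outputs.loss           # combined start/end cross-entropy
```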
@@ -24,9 +24,11 @@ from .utils import CACHE_DIR, require_tf, slow
if is_tf_available():
+    import tensorflow as tf
    from transformers.modeling_tf_xxx import (
        TFXxxModel,
        TFXxxForMaskedLM,
+        TFXxxForMultipleChoice,
        TFXxxForSequenceClassification,
        TFXxxForTokenClassification,
        TFXxxForQuestionAnswering,
@@ -40,6 +42,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
        (
            TFXxxModel,
            TFXxxForMaskedLM,
+            TFXxxForMultipleChoice,
            TFXxxForQuestionAnswering,
            TFXxxForSequenceClassification,
            TFXxxForTokenClassification,
@@ -128,6 +131,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
+            return_dict=True,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -137,33 +141,26 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
    ):
        model = TFXxxModel(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        sequence_output, pooled_output = model(inputs)
+        result = model(inputs)
        inputs = [input_ids, input_mask]
-        sequence_output, pooled_output = model(inputs)
-        sequence_output, pooled_output = model(input_ids)
-        result = {
-            "sequence_output": sequence_output.numpy(),
-            "pooled_output": pooled_output.numpy(),
-        }
+        result = model(inputs)
+        result = model(input_ids)
        self.parent.assertListEqual(
-            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
-        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+        self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
    def create_and_check_xxx_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFXxxForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (prediction_scores,) = model(inputs)
-        result = {
-            "prediction_scores": prediction_scores.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(
-            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
    def create_and_check_xxx_for_sequence_classification(
@@ -172,22 +169,32 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
        config.num_labels = self.num_labels
        model = TFXxxForSequenceClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (logits,) = model(inputs)
-        result = {
-            "logits": logits.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
+    def create_and_check_bert_for_multiple_choice(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        config.num_choices = self.num_choices
+        model = TFXxxForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        result = model(inputs)
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
    def create_and_check_xxx_for_token_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        config.num_labels = self.num_labels
        model = TFXxxForTokenClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (logits,) = model(inputs)
-        result = {
-            "logits": logits.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(
            list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]
        )
@@ -197,11 +204,7 @@ class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase):
    ):
        model = TFXxxForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        start_logits, end_logits = model(inputs)
-        result = {
-            "start_logits": start_logits.numpy(),
-            "end_logits": end_logits.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
......
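The test rewrites work because the configs now set `return_dict=True`, and a `ModelOutput` can be indexed by string exactly like the old hand-built result dicts. A before/after sketch with a hypothetical `model` and `inputs`:

```python
# Before: positional unpacking plus a hand-built dict, fragile whenever the tuple changed
#     (logits,) = model(inputs)
#     result = {"logits": logits.numpy()}

# After: one named object serves both access styles
result = model(inputs)
assert result["logits"] is result.logits
```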
@@ -116,6 +116,7 @@ class TFAlbertModelTester:
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
+            return_dict=True,
        )
        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -129,21 +130,17 @@ class TFAlbertModelTester:
        # 'token_type_ids': token_type_ids}
        # sequence_output, pooled_output = model(**inputs)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        sequence_output, pooled_output = model(inputs)
+        result = model(inputs)
        inputs = [input_ids, input_mask]
-        sequence_output, pooled_output = model(inputs)
-        sequence_output, pooled_output = model(input_ids)
-        result = {
-            "sequence_output": sequence_output.numpy(),
-            "pooled_output": pooled_output.numpy(),
-        }
+        result = model(inputs)
+        result = model(input_ids)
        self.parent.assertListEqual(
-            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+            list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
        )
-        self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+        self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
    def create_and_check_albert_for_pretraining(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -151,28 +148,19 @@ class TFAlbertModelTester:
        config.num_labels = self.num_labels
        model = TFAlbertForPreTraining(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        prediction_scores, sop_scores = model(inputs)
-        result = {
-            "prediction_scores": prediction_scores.numpy(),
-            "sop_scores": sop_scores.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(
-            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
+            list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
        )
-        self.parent.assertListEqual(list(result["sop_scores"].shape), [self.batch_size, self.num_labels])
+        self.parent.assertListEqual(list(result["sop_logits"].shape), [self.batch_size, self.num_labels])
    def create_and_check_albert_for_masked_lm(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = TFAlbertForMaskedLM(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (prediction_scores,) = model(inputs)
-        result = {
-            "prediction_scores": prediction_scores.numpy(),
-        }
-        self.parent.assertListEqual(
-            list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
-        )
+        result = model(inputs)
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
    def create_and_check_albert_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
@@ -180,10 +168,7 @@ class TFAlbertModelTester:
        config.num_labels = self.num_labels
        model = TFAlbertForSequenceClassification(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        (logits,) = model(inputs)
-        result = {
-            "logits": logits.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
    def create_and_check_albert_for_question_answering(
@@ -191,11 +176,7 @@ class TFAlbertModelTester:
    ):
        model = TFAlbertForQuestionAnswering(config=config)
        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
-        start_logits, end_logits = model(inputs)
-        result = {
-            "start_logits": start_logits.numpy(),
-            "end_logits": end_logits.numpy(),
-        }
+        result = model(inputs)
        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
......
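The Albert hunks above also rename the pretraining outputs: `prediction_scores`/`sop_scores` become `prediction_logits`/`sop_logits` on the output object. A hedged sketch of reading them (config values are illustrative):

```python
import tensorflow as tf
from transformers import AlbertConfig, TFAlbertForPreTraining

config = AlbertConfig(
    vocab_size=99, embedding_size=16, hidden_size=32, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=37, return_dict=True,
)
model = TFAlbertForPreTraining(config=config)
result = model(tf.constant([[7, 4, 2, 9, 1]]))

prediction_logits = result["prediction_logits"]  # (batch, seq_len, vocab_size)
sop_logits = result["sop_logits"]                # (batch, num_labels)
```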
...@@ -118,6 +118,7 @@ class TFBertModelTester: ...@@ -118,6 +118,7 @@ class TFBertModelTester:
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
...@@ -130,18 +131,14 @@ class TFBertModelTester: ...@@ -130,18 +131,14 @@ class TFBertModelTester:
sequence_output, pooled_output = model(inputs) sequence_output, pooled_output = model(inputs)
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs) result = model(inputs)
sequence_output, pooled_output = model(input_ids) result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) self.parent.assertListEqual(list(result["pooler_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_bert_lm_head( def create_and_check_bert_lm_head(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
...@@ -153,7 +150,7 @@ class TFBertModelTester: ...@@ -153,7 +150,7 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(prediction_scores,) = model(inputs) prediction_scores = model(inputs)["logits"]
self.parent.assertListEqual( self.parent.assertListEqual(
list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
...@@ -167,39 +164,27 @@ class TFBertModelTester: ...@@ -167,39 +164,27 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(prediction_scores,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_bert_for_next_sequence_prediction( def create_and_check_bert_for_next_sequence_prediction(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFBertForNextSentencePrediction(config=config) model = TFBertForNextSentencePrediction(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
(seq_relationship_score,) = model(inputs) result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, 2])
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2])
def create_and_check_bert_for_pretraining( def create_and_check_bert_for_pretraining(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
): ):
model = TFBertForPreTraining(config=config) model = TFBertForPreTraining(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores, seq_relationship_score = model(inputs) result = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
"seq_relationship_score": seq_relationship_score.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] list(result["prediction_logits"].shape), [self.batch_size, self.seq_length, self.vocab_size]
) )
self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) self.parent.assertListEqual(list(result["seq_relationship_logits"].shape), [self.batch_size, 2])
def create_and_check_bert_for_sequence_classification( def create_and_check_bert_for_sequence_classification(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
...@@ -212,8 +197,7 @@ class TFBertModelTester: ...@@ -212,8 +197,7 @@ class TFBertModelTester:
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_bert_for_multiple_choice( def create_and_check_bert_for_multiple_choice(
...@@ -229,8 +213,7 @@ class TFBertModelTester: ...@@ -229,8 +213,7 @@ class TFBertModelTester:
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
"token_type_ids": multiple_choice_token_type_ids, "token_type_ids": multiple_choice_token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_bert_for_token_classification( def create_and_check_bert_for_token_classification(
...@@ -243,10 +226,7 @@ class TFBertModelTester: ...@@ -243,10 +226,7 @@ class TFBertModelTester:
"attention_mask": input_mask, "attention_mask": input_mask,
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def create_and_check_bert_for_question_answering( def create_and_check_bert_for_question_answering(
...@@ -259,8 +239,7 @@ class TFBertModelTester: ...@@ -259,8 +239,7 @@ class TFBertModelTester:
"token_type_ids": token_type_ids, "token_type_ids": token_type_ids,
} }
start_logits, end_logits = model(inputs) result = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
......
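Worth noting about the Bert hunks: the output objects stay backward compatible, so positional indexing such as `outputs[0]` keeps working at untouched call sites. A sketch under the same toy-config assumptions as above:

```python
import tensorflow as tf
from transformers import BertConfig, TFBertModel

config = BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=2,
                    num_attention_heads=4, intermediate_size=37, return_dict=True)
model = TFBertModel(config=config)
result = model(tf.constant([[7, 4, 2, 9, 1]]))

# A ModelOutput behaves like both a tuple and a dict.
assert result[0] is result["last_hidden_state"]
assert result[1] is result["pooler_output"]
```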
...@@ -35,7 +35,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase): ...@@ -35,7 +35,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
[[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32, [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]], dtype=tf.int32,
) # J'aime le camembert !" ) # J'aime le camembert !"
output = model(input_ids)[0] output = model(input_ids)["last_hidden_state"]
expected_shape = tf.TensorShape((1, 10, 768)) expected_shape = tf.TensorShape((1, 10, 768))
self.assertEqual(output.shape, expected_shape) self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice. # compare the actual values for a slice.
......
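The Camembert integration test above now fetches `last_hidden_state` by key before checking a slice of values. The comparison it leads into typically looks like the helper below; `assert_slice_close` is a hypothetical name for illustration, and the real test pins actual model activations rather than placeholders:

```python
import numpy as np
import tensorflow as tf

def assert_slice_close(output, expected_slice, atol=1e-4):
    # Compare the leading 3x3 corner of the hidden states to pinned values.
    np.testing.assert_allclose(output[:, :3, :3].numpy(), expected_slice, atol=atol)
```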
...@@ -146,7 +146,8 @@ class TFModelTesterMixin: ...@@ -146,7 +146,8 @@ class TFModelTesterMixin:
tf.saved_model.save(model, tmpdirname) tf.saved_model.save(model, tmpdirname)
model = tf.keras.models.load_model(tmpdirname) model = tf.keras.models.load_model(tmpdirname)
outputs = model(inputs_dict) outputs = model(inputs_dict)
hidden_states = [t.numpy() for t in outputs[-1]] output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1]
hidden_states = [t.numpy() for t in output]
self.assertEqual(len(outputs), num_out) self.assertEqual(len(outputs), num_out)
self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
self.assertListEqual( self.assertListEqual(
...@@ -177,7 +178,8 @@ class TFModelTesterMixin: ...@@ -177,7 +178,8 @@ class TFModelTesterMixin:
tf.saved_model.save(model, tmpdirname) tf.saved_model.save(model, tmpdirname)
model = tf.keras.models.load_model(tmpdirname) model = tf.keras.models.load_model(tmpdirname)
outputs = model(inputs_dict) outputs = model(inputs_dict)
attentions = [t.numpy() for t in outputs[-1]] output = outputs[list(outputs.keys())[-1]] if isinstance(outputs, dict) else outputs[-1]
attentions = [t.numpy() for t in output]
self.assertEqual(len(outputs), num_out) self.assertEqual(len(outputs), num_out)
self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
self.assertListEqual( self.assertListEqual(
...@@ -238,6 +240,8 @@ class TFModelTesterMixin: ...@@ -238,6 +240,8 @@ class TFModelTesterMixin:
# Make sure we don't have nans # Make sure we don't have nans
if isinstance(after_outputs, tf.Tensor): if isinstance(after_outputs, tf.Tensor):
out_1 = after_outputs.numpy() out_1 = after_outputs.numpy()
elif isinstance(after_outputs, dict):
out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
else: else:
out_1 = after_outputs[0].numpy() out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy() out_2 = outputs[0].numpy()
......
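The mixin changes above repeat the same dict-or-tuple dispatch in two tests; factored out, the logic is just the helper below (`last_output` is a hypothetical name, not part of the diff):

```python
def last_output(outputs):
    # A Keras SavedModel reload may hand back a plain dict whose insertion
    # order mirrors the positional order of the original tuple outputs,
    # so the last value by key corresponds to outputs[-1].
    if isinstance(outputs, dict):
        return outputs[list(outputs.keys())[-1]]
    return outputs[-1]

# e.g. hidden_states = [t.numpy() for t in last_output(outputs)]
```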
...@@ -89,9 +89,10 @@ class TFCTRLModelTester(object): ...@@ -89,9 +89,10 @@ class TFCTRLModelTester(object):
# hidden_dropout_prob=self.hidden_dropout_prob, # hidden_dropout_prob=self.hidden_dropout_prob,
# attention_probs_dropout_prob=self.attention_probs_dropout_prob, # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
n_positions=self.max_position_embeddings, n_positions=self.max_position_embeddings,
n_ctx=self.max_position_embeddings n_ctx=self.max_position_embeddings,
# type_vocab_size=self.type_vocab_size, # type_vocab_size=self.type_vocab_size,
# initializer_range=self.initializer_range # initializer_range=self.initializer_range,
return_dict=True,
) )
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
...@@ -111,30 +112,22 @@ class TFCTRLModelTester(object): ...@@ -111,30 +112,22 @@ class TFCTRLModelTester(object):
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFCTRLModel(config=config) model = TFCTRLModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
sequence_output = model(inputs)[0] result = model(inputs)
inputs = [input_ids, None, input_mask] # None is the input for 'past' inputs = [input_ids, None, input_mask] # None is the input for 'past'
sequence_output = model(inputs)[0] result = model(inputs)
sequence_output = model(input_ids)[0] result = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = TFCTRLLMHeadModel(config=config) model = TFCTRLLMHeadModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
prediction_scores = model(inputs)[0] result = model(inputs)
result = { self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs() config_and_inputs = self.prepare_config_and_inputs()
......
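As in the other testers, the CTRL config above now sets `return_dict=True`. The flag can also be overridden per call, which is how the legacy tuple behavior stays reachable; a sketch with illustrative sizes:

```python
import tensorflow as tf
from transformers import CTRLConfig, TFCTRLLMHeadModel

config = CTRLConfig(vocab_size=99, n_embd=32, n_layer=2, n_head=4,
                    n_positions=64, n_ctx=64, return_dict=True)
model = TFCTRLLMHeadModel(config=config)
input_ids = tf.constant([[7, 4, 2, 9, 1]])

dict_out = model(input_ids)                      # ModelOutput with named fields
tuple_out = model(input_ids, return_dict=False)  # legacy tuple output
assert dict_out["logits"].shape == tuple_out[0].shape
```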
...@@ -89,6 +89,7 @@ class TFDistilBertModelTester: ...@@ -89,6 +89,7 @@ class TFDistilBertModelTester:
attention_dropout=self.attention_probs_dropout_prob, attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings, max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,
return_dict=True,
) )
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
...@@ -99,18 +100,14 @@ class TFDistilBertModelTester: ...@@ -99,18 +100,14 @@ class TFDistilBertModelTester:
model = TFDistilBertModel(config=config) model = TFDistilBertModel(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
outputs = model(inputs) result = model(inputs)
sequence_output = outputs[0]
inputs = [input_ids, input_mask] inputs = [input_ids, input_mask]
(sequence_output,) = model(inputs) result = model(inputs)
result = {
"sequence_output": sequence_output.numpy(),
}
self.parent.assertListEqual( self.parent.assertListEqual(
list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] list(result["last_hidden_state"].shape), [self.batch_size, self.seq_length, self.hidden_size]
) )
def create_and_check_distilbert_for_masked_lm( def create_and_check_distilbert_for_masked_lm(
...@@ -118,11 +115,8 @@ class TFDistilBertModelTester: ...@@ -118,11 +115,8 @@ class TFDistilBertModelTester:
): ):
model = TFDistilBertForMaskedLM(config=config) model = TFDistilBertForMaskedLM(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(prediction_scores,) = model(inputs) result = model(inputs)
result = {"prediction_scores": prediction_scores.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
self.parent.assertListEqual(
list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_distilbert_for_question_answering( def create_and_check_distilbert_for_question_answering(
self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
...@@ -132,8 +126,7 @@ class TFDistilBertModelTester: ...@@ -132,8 +126,7 @@ class TFDistilBertModelTester:
"input_ids": input_ids, "input_ids": input_ids,
"attention_mask": input_mask, "attention_mask": input_mask,
} }
start_logits, end_logits = model(inputs) result = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
...@@ -143,8 +136,7 @@ class TFDistilBertModelTester: ...@@ -143,8 +136,7 @@ class TFDistilBertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFDistilBertForSequenceClassification(config) model = TFDistilBertForSequenceClassification(config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_distilbert_for_multiple_choice( def create_and_check_distilbert_for_multiple_choice(
...@@ -158,8 +150,7 @@ class TFDistilBertModelTester: ...@@ -158,8 +150,7 @@ class TFDistilBertModelTester:
"input_ids": multiple_choice_inputs_ids, "input_ids": multiple_choice_inputs_ids,
"attention_mask": multiple_choice_input_mask, "attention_mask": multiple_choice_input_mask,
} }
(logits,) = model(inputs) result = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_distilbert_for_token_classification( def create_and_check_distilbert_for_token_classification(
...@@ -168,10 +159,7 @@ class TFDistilBertModelTester: ...@@ -168,10 +159,7 @@ class TFDistilBertModelTester:
config.num_labels = self.num_labels config.num_labels = self.num_labels
model = TFDistilBertForTokenClassification(config) model = TFDistilBertForTokenClassification(config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask} inputs = {"input_ids": input_ids, "attention_mask": input_mask}
(logits,) = model(inputs) result = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels]) self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
def prepare_config_and_inputs_for_common(self): def prepare_config_and_inputs_for_common(self):
......
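Across the DistilBert hunks the assertions now read from named keys. Since a ModelOutput drops fields that are `None`, a classification call with no hidden states or attentions requested exposes a single `logits` entry; a sketch with illustrative config values:

```python
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification

config = DistilBertConfig(vocab_size=99, dim=32, n_layers=2, n_heads=4,
                          hidden_dim=37, return_dict=True)
model = TFDistilBertForSequenceClassification(config)
result = model(tf.constant([[7, 4, 2, 9, 1]]))

# None-valued fields (hidden_states, attentions) are dropped from the keys.
assert list(result.keys()) == ["logits"]
assert result["logits"].shape == (1, config.num_labels)
```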