Unverified Commit f4e1f022 authored by Joseph Liu, committed by GitHub

Output hidden states (#4978)



* Configure all models to use output_hidden_states as argument passed to forward()

* Pass all tests

* Remove cast_bool_to_primitive in TF Flaubert model

* correct tf xlnet

* add pytorch test

* add tf test

* Fix broken tests

* Refactor output_hidden_states for mobilebert

* Reset and remerge to master
Co-authored-by: Joseph Liu <joseph.liu@coinflex.com>
Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
parent 866a8cca
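
The net effect of this commit: `output_hidden_states` becomes a per-call argument of `forward()` instead of only a constructor-time config flag. A minimal sketch of the new call pattern, assuming a standard pretrained checkpoint (the model name and output-tuple layout below are illustrative of the PyTorch models touched here, not code from this diff):

    import torch
    from transformers import AlbertModel, AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    model = AlbertModel.from_pretrained("albert-base-v2")
    model.eval()

    input_ids = tokenizer.encode("Hello world", return_tensors="pt")
    with torch.no_grad():
        # output_hidden_states is now a forward() argument, not only a config field
        outputs = model(input_ids, output_hidden_states=True)

    # with output_attentions off, the tuple is (sequence_output, pooled_output, hidden_states)
    hidden_states = outputs[-1]  # embedding output plus one tensor per layer
    print(len(hidden_states), hidden_states[0].shape)
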
@@ -269,7 +269,9 @@ class AlbertLayer(nn.Module):
         self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size)
         self.activation = ACT2FN[config.hidden_act]
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+    def forward(
+        self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False
+    ):
         attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
         ffn_output = self.ffn(attention_output[0])
         ffn_output = self.activation(ffn_output)
@@ -283,10 +285,11 @@ class AlbertLayerGroup(nn.Module):
     def __init__(self, config):
         super().__init__()
 
-        self.output_hidden_states = config.output_hidden_states
         self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)])
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+    def forward(
+        self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False
+    ):
         layer_hidden_states = ()
         layer_attentions = ()
 
@@ -297,11 +300,11 @@ class AlbertLayerGroup(nn.Module):
             if output_attentions:
                 layer_attentions = layer_attentions + (layer_output[1],)
 
-            if self.output_hidden_states:
+            if output_hidden_states:
                 layer_hidden_states = layer_hidden_states + (hidden_states,)
 
         outputs = (hidden_states,)
-        if self.output_hidden_states:
+        if output_hidden_states:
             outputs = outputs + (layer_hidden_states,)
         if output_attentions:
             outputs = outputs + (layer_attentions,)
@@ -313,16 +316,17 @@ class AlbertTransformer(nn.Module):
         super().__init__()
 
         self.config = config
-        self.output_hidden_states = config.output_hidden_states
         self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size)
         self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+    def forward(
+        self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False
+    ):
         hidden_states = self.embedding_hidden_mapping_in(hidden_states)
 
         all_attentions = ()
 
-        if self.output_hidden_states:
+        if output_hidden_states:
             all_hidden_states = (hidden_states,)
 
         for i in range(self.config.num_hidden_layers):
@@ -337,17 +341,18 @@ class AlbertTransformer(nn.Module):
                 attention_mask,
                 head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
                 output_attentions,
+                output_hidden_states,
             )
             hidden_states = layer_group_output[0]
 
             if output_attentions:
                 all_attentions = all_attentions + layer_group_output[-1]
 
-            if self.output_hidden_states:
+            if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
         outputs = (hidden_states,)
-        if self.output_hidden_states:
+        if output_hidden_states:
             outputs = outputs + (all_hidden_states,)
         if output_attentions:
             outputs = outputs + (all_attentions,)
@@ -489,6 +494,7 @@ class AlbertModel(AlbertPreTrainedModel):
         head_mask=None,
         inputs_embeds=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
     Return:
@@ -504,7 +510,9 @@ class AlbertModel(AlbertPreTrainedModel):
             This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -530,6 +538,9 @@ class AlbertModel(AlbertPreTrainedModel):
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -556,7 +567,11 @@ class AlbertModel(AlbertPreTrainedModel):
             input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
         )
         encoder_outputs = self.encoder(
-            embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions,
+            embedding_output,
+            extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = encoder_outputs[0]
@@ -603,6 +618,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         labels=None,
         sentence_order_label=None,
         output_attentions=None,
+        output_hidden_states=None,
         **kwargs,
     ):
         r"""
@@ -628,7 +644,9 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
             Prediction scores of the next sequence prediction (classification) head (scores of True/False
             continuation before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -672,6 +690,7 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output, pooled_output = outputs[:2]
@@ -758,6 +777,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
         **kwargs
     ):
         r"""
@@ -775,7 +795,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
             Masked language modeling loss.
         prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -815,6 +837,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
         sequence_outputs = outputs[0]
@@ -856,6 +879,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -870,7 +894,9 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
             Classification (or regression if config.num_labels==1) loss.
         logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
             Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -904,6 +930,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         pooled_output = outputs[1]
@@ -953,6 +980,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -965,7 +993,9 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
             Classification loss.
         scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
             Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1001,6 +1031,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
@@ -1052,6 +1083,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
         start_positions=None,
         end_positions=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1071,7 +1103,9 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
             Span-start scores (before SoftMax).
         end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
             Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1107,6 +1141,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
@@ -1163,6 +1198,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1178,7 +1214,9 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
             Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1228,6 +1266,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         pooled_output = outputs[1]
...
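
Note the resolution rule added to `AlbertModel.forward()` (and repeated in the other base models below): `None` means "fall back to the config", so any explicit per-call value wins. A small sketch of the resulting semantics, assuming the same illustrative checkpoint as above:

    import torch
    from transformers import AlbertModel

    # config default enables hidden states for every call...
    model = AlbertModel.from_pretrained("albert-base-v2", output_hidden_states=True)
    input_ids = torch.tensor([[2, 10975, 3]])  # illustrative token ids

    outputs = model(input_ids)                              # None -> config default, hidden states included
    outputs = model(input_ids, output_hidden_states=False)  # explicit False overrides the config default
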
@@ -248,7 +248,6 @@ class BartEncoder(nn.Module):
         self.dropout = config.dropout
         self.layerdrop = config.encoder_layerdrop
-        self.output_hidden_states = config.output_hidden_states
 
         embed_dim = embed_tokens.embedding_dim
         self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
@@ -269,7 +268,7 @@ class BartEncoder(nn.Module):
         # mbart has one extra layer_norm
         self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None
 
-    def forward(self, input_ids, attention_mask=None, output_attentions=False):
+    def forward(self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False):
         """
         Args:
             input_ids (LongTensor): tokens in the source language of shape
@@ -281,7 +280,7 @@ class BartEncoder(nn.Module):
                   shape `(src_len, batch, embed_dim)`
                 - **encoder_states** (List[Tensor]): all intermediate
                   hidden states of shape `(src_len, batch, embed_dim)`.
-                  Only populated if *self.output_hidden_states:* is True.
+                  Only populated if *output_hidden_states:* is True.
                 - **all_attentions** (List[Tensor]): Attention weights for each layer.
                   During training might not be of length n_layers because of layer dropout.
         """
@@ -300,7 +299,7 @@ class BartEncoder(nn.Module):
         encoder_states, all_attentions = [], []
         for encoder_layer in self.layers:
-            if self.output_hidden_states:
+            if output_hidden_states:
                 encoder_states.append(x)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = random.uniform(0, 1)
@@ -314,7 +313,7 @@ class BartEncoder(nn.Module):
         if self.layer_norm:
             x = self.layer_norm(x)
-        if self.output_hidden_states:
+        if output_hidden_states:
             encoder_states.append(x)
 
         # T x B x C -> B x T x C
@@ -424,7 +423,6 @@ class BartDecoder(nn.Module):
 
     def __init__(self, config: BartConfig, embed_tokens: nn.Embedding):
         super().__init__()
-        self.output_hidden_states = config.output_hidden_states
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
         self.padding_idx = embed_tokens.padding_idx
@@ -455,6 +453,7 @@ class BartDecoder(nn.Module):
         decoder_cached_states=None,
         use_cache=False,
         output_attentions=False,
+        output_hidden_states=False,
         **unused,
     ):
         """
@@ -502,7 +501,7 @@ class BartDecoder(nn.Module):
         next_decoder_cache = []
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if self.output_hidden_states:
+            if output_hidden_states:
                 all_hidden_states += (x,)
             dropout_probability = random.uniform(0, 1)
             if self.training and (dropout_probability < self.layerdrop):
@@ -797,7 +796,6 @@ def _get_shape(t):
 class BartModel(PretrainedBartModel):
     def __init__(self, config: BartConfig):
         super().__init__(config)
-        self.output_hidden_states = config.output_hidden_states
 
         padding_idx, vocab_size = config.pad_token_id, config.vocab_size
         self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
@@ -818,8 +816,12 @@ class BartModel(PretrainedBartModel):
         decoder_cached_states=None,
         use_cache=False,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         # make masks if user doesn't supply
         if not use_cache:
@@ -837,7 +839,10 @@ class BartModel(PretrainedBartModel):
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
-                input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
             )
         assert isinstance(encoder_outputs, tuple)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
@@ -849,6 +854,7 @@ class BartModel(PretrainedBartModel):
             decoder_causal_mask=causal_mask,
             decoder_cached_states=decoder_cached_states,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
             use_cache=use_cache,
         )
@@ -910,6 +916,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
         labels=None,
         use_cache=False,
         output_attentions=None,
+        output_hidden_states=None,
         **unused,
     ):
         r"""
@@ -926,7 +933,9 @@ class BartForConditionalGeneration(PretrainedBartModel):
             Masked language modeling loss.
         prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -969,6 +978,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
             decoder_cached_states=decoder_cached_states,
             use_cache=use_cache,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
         lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
         outputs = (lm_logits,) + outputs[1:]  # Add cache, hidden states and attention if they are here
@@ -1060,6 +1070,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         decoder_attention_mask=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1073,7 +1084,9 @@ class BartForSequenceClassification(PretrainedBartModel):
             Classification loss (cross entropy)
         logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
             Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
             Hidden-states of the model at the output of each layer plus the initial embedding outputs.
@@ -1104,6 +1117,7 @@ class BartForSequenceClassification(PretrainedBartModel):
             decoder_attention_mask=decoder_attention_mask,
             encoder_outputs=encoder_outputs,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
         x = outputs[0]  # last hidden state
         eos_mask = input_ids.eq(self.config.eos_token_id)
@@ -1148,6 +1162,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
         start_positions=None,
         end_positions=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1167,7 +1182,9 @@ class BartForQuestionAnswering(PretrainedBartModel):
             Span-start scores (before SoftMax).
         end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
             Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned
+            when ``output_hidden_states=True`` is passed or when
+            ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1206,6 +1223,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
             decoder_attention_mask=decoder_attention_mask,
             encoder_outputs=encoder_outputs,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
         sequence_output = outputs[0]
...
@@ -391,7 +391,6 @@ class BertLayer(nn.Module):
 class BertEncoder(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(
@@ -402,11 +401,12 @@ class BertEncoder(nn.Module):
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         output_attentions=False,
+        output_hidden_states=False,
     ):
         all_hidden_states = ()
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
+            if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(
@@ -423,11 +423,11 @@ class BertEncoder(nn.Module):
                 all_attentions = all_attentions + (layer_outputs[1],)
 
         # Add last layer
-        if self.output_hidden_states:
+        if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         outputs = (hidden_states,)
-        if self.output_hidden_states:
+        if output_hidden_states:
             outputs = outputs + (all_hidden_states,)
         if output_attentions:
             outputs = outputs + (all_attentions,)
@@ -656,6 +656,7 @@ class BertModel(BertPreTrainedModel):
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
     Return:
@@ -671,7 +672,7 @@ class BertModel(BertPreTrainedModel):
             This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -698,6 +699,9 @@ class BertModel(BertPreTrainedModel):
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
 
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -747,6 +751,7 @@ class BertModel(BertPreTrainedModel):
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_extended_attention_mask,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
@@ -786,6 +791,7 @@ class BertForPreTraining(BertPreTrainedModel):
         labels=None,
         next_sentence_label=None,
         output_attentions=None,
+        output_hidden_states=None,
         **kwargs
     ):
         r"""
@@ -811,7 +817,7 @@ class BertForPreTraining(BertPreTrainedModel):
         seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
             Prediction scores of the next sequence prediction (classification) head (scores of True/False
             continuation before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -854,6 +860,7 @@ class BertForPreTraining(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output, pooled_output = outputs[:2]
@@ -902,6 +909,7 @@ class BertLMHeadModel(BertPreTrainedModel):
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         output_attentions=None,
+        output_hidden_states=None,
         **kwargs
     ):
         r"""
@@ -919,7 +927,7 @@ class BertLMHeadModel(BertPreTrainedModel):
             Next token prediction loss.
         prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -956,6 +964,7 @@ class BertLMHeadModel(BertPreTrainedModel):
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
@@ -1012,6 +1021,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         output_attentions=None,
+        output_hidden_states=None,
         **kwargs
     ):
         r"""
@@ -1029,7 +1039,7 @@ class BertForMaskedLM(BertPreTrainedModel):
             Masked language modeling loss.
         prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1074,6 +1084,7 @@ class BertForMaskedLM(BertPreTrainedModel):
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
@@ -1126,6 +1137,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         inputs_embeds=None,
         next_sentence_label=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1140,7 +1152,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
             Next sequence prediction (classification) loss.
         seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
             Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1176,6 +1188,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         pooled_output = outputs[1]
@@ -1218,6 +1231,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1232,7 +1246,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
             Classification (or regression if config.num_labels==1) loss.
         logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
             Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1268,6 +1282,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         pooled_output = outputs[1]
@@ -1316,6 +1331,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1331,7 +1347,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
             `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
             Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1382,6 +1398,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         pooled_output = outputs[1]
@@ -1427,6 +1444,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         inputs_embeds=None,
         labels=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
@@ -1439,7 +1457,7 @@ class BertForTokenClassification(BertPreTrainedModel):
             Classification loss.
         scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
             Classification scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1475,6 +1493,7 @@ class BertForTokenClassification(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
@@ -1527,6 +1546,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         start_positions=None,
         end_positions=None,
         output_attentions=None,
+        output_hidden_states=None,
     ):
         r"""
         start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
@@ -1546,7 +1566,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             Span-start scores (before SoftMax).
         end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
             Span-end scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.
@@ -1586,6 +1606,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
         )
 
         sequence_output = outputs[0]
...
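
The "add pytorch test" bullet in the commit message lands in the shared model tests; below is a hedged sketch of the kind of assertion such a test makes against the BERT changes above (the config values and token ids are illustrative, not the actual test code):

    import torch
    from transformers import BertConfig, BertModel

    config = BertConfig(num_hidden_layers=2, hidden_size=32, num_attention_heads=4, intermediate_size=37)
    model = BertModel(config)
    model.eval()

    input_ids = torch.tensor([[101, 7592, 102]])
    with torch.no_grad():
        outputs = model(input_ids, output_hidden_states=True)

    hidden_states = outputs[-1]
    # one tensor for the embeddings plus one per transformer layer
    assert len(hidden_states) == config.num_hidden_layers + 1
    assert hidden_states[0].shape == (1, 3, config.hidden_size)
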
...@@ -296,7 +296,6 @@ CTRL_INPUTS_DOCSTRING = r""" ...@@ -296,7 +296,6 @@ CTRL_INPUTS_DOCSTRING = r"""
class CTRLModel(CTRLPreTrainedModel): class CTRLModel(CTRLPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states
self.d_model_size = config.n_embd self.d_model_size = config.n_embd
self.num_layers = config.n_layer self.num_layers = config.n_layer
...@@ -338,6 +337,7 @@ class CTRLModel(CTRLPreTrainedModel): ...@@ -338,6 +337,7 @@ class CTRLModel(CTRLPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
...@@ -347,7 +347,8 @@ class CTRLModel(CTRLPreTrainedModel): ...@@ -347,7 +347,8 @@ class CTRLModel(CTRLPreTrainedModel):
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. Can be used (see `past` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -374,6 +375,9 @@ class CTRLModel(CTRLPreTrainedModel): ...@@ -374,6 +375,9 @@ class CTRLModel(CTRLPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -446,7 +450,7 @@ class CTRLModel(CTRLPreTrainedModel): ...@@ -446,7 +450,7 @@ class CTRLModel(CTRLPreTrainedModel):
all_hidden_states = () all_hidden_states = ()
all_attentions = [] all_attentions = []
for i, (h, layer_past) in enumerate(zip(self.h, past)): for i, (h, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = h( outputs = h(
hidden_states, hidden_states,
...@@ -466,13 +470,13 @@ class CTRLModel(CTRLPreTrainedModel): ...@@ -466,13 +470,13 @@ class CTRLModel(CTRLPreTrainedModel):
hidden_states = self.layernorm(hidden_states) hidden_states = self.layernorm(hidden_states)
hidden_states = hidden_states.view(*output_shape) hidden_states = hidden_states.view(*output_shape)
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if use_cache is True: if use_cache is True:
outputs = outputs + (presents,) outputs = outputs + (presents,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
...@@ -518,6 +522,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -518,6 +522,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
labels=None, labels=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -536,7 +541,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -536,7 +541,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. Can be used (see `past` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -571,6 +577,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -571,6 +577,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
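The resolution logic added at the top of each forward() follows one pattern: the per-call argument wins, and None falls back to the config default. Sketched in isolation (the function name here is illustrative, not from the diff):

def resolve_output_hidden_states(output_hidden_states, config):
    # Per-call value takes precedence; None defers to the config-wide default.
    return (
        output_hidden_states if output_hidden_states is not None else config.output_hidden_states
    )

Because the check is `is not None`, an explicit False per call disables hidden states even when the config default is True.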
...@@ -259,12 +259,11 @@ class Transformer(nn.Module): ...@@ -259,12 +259,11 @@ class Transformer(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
self.n_layers = config.n_layers self.n_layers = config.n_layers
self.output_hidden_states = config.output_hidden_states
layer = TransformerBlock(config) layer = TransformerBlock(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False): def forward(self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False):
""" """
Parameters Parameters
---------- ----------
...@@ -289,7 +288,7 @@ class Transformer(nn.Module): ...@@ -289,7 +288,7 @@ class Transformer(nn.Module):
hidden_state = x hidden_state = x
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,) all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module( layer_outputs = layer_module(
...@@ -305,11 +304,11 @@ class Transformer(nn.Module): ...@@ -305,11 +304,11 @@ class Transformer(nn.Module):
assert len(layer_outputs) == 1 assert len(layer_outputs) == 1
# Add last layer # Add last layer
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,) all_hidden_states = all_hidden_states + (hidden_state,)
outputs = (hidden_state,) outputs = (hidden_state,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -411,14 +410,20 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -411,14 +410,20 @@ class DistilBertModel(DistilBertPreTrainedModel):
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def forward( def forward(
self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -445,6 +450,9 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -445,6 +450,9 @@ class DistilBertModel(DistilBertPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -466,7 +474,11 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -466,7 +474,11 @@ class DistilBertModel(DistilBertPreTrainedModel):
if inputs_embeds is None: if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim)
tfmr_output = self.transformer( tfmr_output = self.transformer(
x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions, x=inputs_embeds,
attn_mask=attention_mask,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_state = tfmr_output[0] hidden_state = tfmr_output[0]
output = (hidden_state,) + tfmr_output[1:] output = (hidden_state,) + tfmr_output[1:]
...@@ -480,7 +492,6 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -480,7 +492,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
class DistilBertForMaskedLM(DistilBertPreTrainedModel): class DistilBertForMaskedLM(DistilBertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states
self.distilbert = DistilBertModel(config) self.distilbert = DistilBertModel(config)
self.vocab_transform = nn.Linear(config.dim, config.dim) self.vocab_transform = nn.Linear(config.dim, config.dim)
...@@ -503,6 +514,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -503,6 +514,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -520,7 +532,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -520,7 +532,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -558,6 +570,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -558,6 +570,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = dlbrt_output[0] # (bs, seq_length, dim) hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
...@@ -599,6 +612,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -599,6 +612,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -613,7 +627,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -613,7 +627,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -644,6 +658,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -644,6 +658,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_state = distilbert_output[0] # (bs, seq_len, dim) hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = hidden_state[:, 0] # (bs, dim)
...@@ -691,6 +706,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -691,6 +706,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -710,7 +726,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -710,7 +726,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -742,6 +758,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -742,6 +758,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
...@@ -797,6 +814,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -797,6 +814,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -809,7 +827,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -809,7 +827,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -841,6 +859,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -841,6 +859,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -891,6 +910,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): ...@@ -891,6 +910,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -906,7 +926,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): ...@@ -906,7 +926,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above). `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -953,6 +973,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): ...@@ -953,6 +973,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim)
......
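The same override flows through the task heads, which now simply forward the flag to the base model. A short sketch with DistilBERT, assuming the "distilbert-base-uncased" checkpoint (illustrative, not from the diff):

import torch
from transformers import DistilBertModel, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.eval()

input_ids = tokenizer.encode("hidden states on demand", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

# DistilBertModel has no pooler: outputs = (last_hidden_state, hidden_states)
hidden_states = outputs[1]
assert len(hidden_states) == model.config.n_layers + 1  # embeddings + 6 layers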
...@@ -273,13 +273,14 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -273,13 +273,14 @@ class ElectraModel(ElectraPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -307,6 +308,9 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -307,6 +308,9 @@ class ElectraModel(ElectraPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -339,6 +343,7 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -339,6 +343,7 @@ class ElectraModel(ElectraPreTrainedModel):
attention_mask=extended_attention_mask, attention_mask=extended_attention_mask,
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
return hidden_states return hidden_states
...@@ -388,6 +393,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -388,6 +393,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -402,7 +408,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -402,7 +408,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -430,7 +436,14 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -430,7 +436,14 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
) )
sequence_output = discriminator_hidden_states[0] sequence_output = discriminator_hidden_states[0]
...@@ -478,6 +491,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ...@@ -478,6 +491,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
...@@ -492,7 +506,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ...@@ -492,7 +506,8 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
Total loss of the ELECTRA objective. Total loss of the ELECTRA objective.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`)
Prediction scores of the head (scores for each token before SoftMax). Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -521,7 +536,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ...@@ -521,7 +536,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions, input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
...@@ -578,6 +600,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -578,6 +600,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -595,7 +618,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -595,7 +618,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -630,7 +653,14 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -630,7 +653,14 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
generator_hidden_states = self.electra( generator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions, input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
) )
generator_sequence_output = generator_hidden_states[0] generator_sequence_output = generator_hidden_states[0]
...@@ -677,6 +707,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -677,6 +707,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -689,7 +720,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -689,7 +720,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -718,7 +749,14 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -718,7 +749,14 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, output_attentions, input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
...@@ -776,6 +814,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -776,6 +814,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -795,7 +834,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -795,7 +834,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -833,6 +872,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -833,6 +872,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = discriminator_hidden_states[0] sequence_output = discriminator_hidden_states[0]
......
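Note that the ELECTRA heads pass the flag positionally, so output_hidden_states must remain the last parameter of ElectraModel.forward(); at call sites, keywords are less brittle. A sketch assuming the "google/electra-small-discriminator" checkpoint (illustrative, not from the diff):

import torch
from transformers import ElectraModel, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
model = ElectraModel.from_pretrained("google/electra-small-discriminator")
model.eval()

input_ids = tokenizer.encode("electra hidden states", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

# ElectraModel returns the encoder tuple directly (no pooler output)
hidden_states = outputs[1]
assert len(hidden_states) == model.config.num_hidden_layers + 1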
...@@ -131,13 +131,14 @@ class FlaubertModel(XLMModel): ...@@ -131,13 +131,14 @@ class FlaubertModel(XLMModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -162,6 +163,9 @@ class FlaubertModel(XLMModel): ...@@ -162,6 +163,9 @@ class FlaubertModel(XLMModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# removed: src_enc=None, src_len=None # removed: src_enc=None, src_len=None
if input_ids is not None: if input_ids is not None:
...@@ -240,7 +244,7 @@ class FlaubertModel(XLMModel): ...@@ -240,7 +244,7 @@ class FlaubertModel(XLMModel):
if self.training and (dropout_probability < self.layerdrop): if self.training and (dropout_probability < self.layerdrop):
continue continue
if self.output_hidden_states: if output_hidden_states:
hidden_states = hidden_states + (tensor,) hidden_states = hidden_states + (tensor,)
# self attention # self attention
...@@ -281,7 +285,7 @@ class FlaubertModel(XLMModel): ...@@ -281,7 +285,7 @@ class FlaubertModel(XLMModel):
tensor *= mask.unsqueeze(-1).to(tensor.dtype) tensor *= mask.unsqueeze(-1).to(tensor.dtype)
# Add last hidden state # Add last hidden state
if self.output_hidden_states: if output_hidden_states:
hidden_states = hidden_states + (tensor,) hidden_states = hidden_states + (tensor,)
# update cache length # update cache length
...@@ -292,7 +296,7 @@ class FlaubertModel(XLMModel): ...@@ -292,7 +296,7 @@ class FlaubertModel(XLMModel):
# tensor = tensor.transpose(0, 1) # tensor = tensor.transpose(0, 1)
outputs = (tensor,) outputs = (tensor,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (hidden_states,) outputs = outputs + (hidden_states,)
if output_attentions: if output_attentions:
outputs = outputs + (attentions,) outputs = outputs + (attentions,)
......
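When the argument is omitted, the config default still governs, so existing code that sets config.output_hidden_states keeps working. A self-contained sketch with a deliberately tiny, randomly initialized Flaubert model (the hyperparameters here are invented purely for illustration):

import torch
from transformers import FlaubertConfig, FlaubertModel

config = FlaubertConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=2,
                        output_hidden_states=True)
model = FlaubertModel(config)
model.eval()

input_ids = torch.tensor([[1, 2, 3]])
outputs = model(input_ids)                              # config default applies
assert len(outputs[1]) == config.n_layers + 1           # embeddings + each layer
outputs = model(input_ids, output_hidden_states=False)  # per-call override wins
assert len(outputs) == 1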
...@@ -347,7 +347,6 @@ GPT2_INPUTS_DOCSTRING = r""" ...@@ -347,7 +347,6 @@ GPT2_INPUTS_DOCSTRING = r"""
class GPT2Model(GPT2PreTrainedModel): class GPT2Model(GPT2PreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states
self.wte = nn.Embedding(config.vocab_size, config.n_embd) self.wte = nn.Embedding(config.vocab_size, config.n_embd)
self.wpe = nn.Embedding(config.n_positions, config.n_embd) self.wpe = nn.Embedding(config.n_positions, config.n_embd)
...@@ -382,6 +381,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -382,6 +381,7 @@ class GPT2Model(GPT2PreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
...@@ -392,7 +392,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -392,7 +392,7 @@ class GPT2Model(GPT2PreTrainedModel):
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. Can be used (see `past` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -417,6 +417,9 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -417,6 +417,9 @@ class GPT2Model(GPT2PreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -486,7 +489,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -486,7 +489,7 @@ class GPT2Model(GPT2PreTrainedModel):
all_attentions = [] all_attentions = []
all_hidden_states = () all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)): for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block( outputs = block(
...@@ -509,13 +512,13 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -509,13 +512,13 @@ class GPT2Model(GPT2PreTrainedModel):
hidden_states = hidden_states.view(*output_shape) hidden_states = hidden_states.view(*output_shape)
# Add last hidden state # Add last hidden state
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if use_cache is True: if use_cache is True:
outputs = outputs + (presents,) outputs = outputs + (presents,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
...@@ -561,6 +564,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -561,6 +564,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
labels=None, labels=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -579,7 +583,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -579,7 +583,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. Can be used (see `past` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -614,6 +618,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -614,6 +618,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
...@@ -668,6 +673,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -668,6 +673,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
mc_labels=None, mc_labels=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -700,7 +706,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -700,7 +706,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. Can be used (see `past` input) to speed up sequential decoding.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -754,6 +760,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -754,6 +760,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
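With GPT-2 the flag composes with use_cache, so one call can return logits, the key/value cache, and all hidden states. A sketch assuming the stock "gpt2" checkpoint (illustrative, not from the diff):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

input_ids = tokenizer.encode("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

# With use_cache=True (the default here): (lm_logits, presents, hidden_states)
lm_logits, presents, hidden_states = outputs
assert len(hidden_states) == model.config.n_layer + 1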
...@@ -587,6 +587,7 @@ class LongformerModel(RobertaModel): ...@@ -587,6 +587,7 @@ class LongformerModel(RobertaModel):
position_ids=None, position_ids=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
...@@ -594,7 +595,7 @@ class LongformerModel(RobertaModel): ...@@ -594,7 +595,7 @@ class LongformerModel(RobertaModel):
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -627,6 +628,9 @@ class LongformerModel(RobertaModel): ...@@ -627,6 +628,9 @@ class LongformerModel(RobertaModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# padding # padding
attention_window = ( attention_window = (
...@@ -668,6 +672,7 @@ class LongformerModel(RobertaModel): ...@@ -668,6 +672,7 @@ class LongformerModel(RobertaModel):
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
# undo padding # undo padding
...@@ -706,6 +711,7 @@ class LongformerForMaskedLM(BertPreTrainedModel): ...@@ -706,6 +711,7 @@ class LongformerForMaskedLM(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -723,7 +729,7 @@ class LongformerForMaskedLM(BertPreTrainedModel): ...@@ -723,7 +729,7 @@ class LongformerForMaskedLM(BertPreTrainedModel):
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -767,6 +773,7 @@ class LongformerForMaskedLM(BertPreTrainedModel): ...@@ -767,6 +773,7 @@ class LongformerForMaskedLM(BertPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output) prediction_scores = self.lm_head(sequence_output)
...@@ -810,6 +817,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -810,6 +817,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -824,7 +832,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -824,7 +832,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -864,6 +872,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -864,6 +872,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
...@@ -931,6 +940,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): ...@@ -931,6 +940,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -949,7 +959,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): ...@@ -949,7 +959,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
...@@ -997,6 +1007,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): ...@@ -997,6 +1007,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1057,6 +1068,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1057,6 +1068,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -1069,7 +1081,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1069,7 +1081,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1103,6 +1115,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1103,6 +1115,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1158,6 +1171,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1158,6 +1171,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
position_ids=None, position_ids=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1173,7 +1187,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1173,7 +1187,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
`num_choices` is the second dimension of the input tensors (see `input_ids` above). `num_choices` is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1239,6 +1253,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1239,6 +1253,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
global_attention_mask=flat_global_attention_mask, global_attention_mask=flat_global_attention_mask,
inputs_embeds=flat_inputs_embeds, inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
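The updated docstrings read "returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``"; the argument takes precedence, so an explicit False also silences a True config. A sketch, assuming the base LongformerModel accepts the flag as the heads above imply:

import torch
from transformers import LongformerModel

model = LongformerModel.from_pretrained("allenai/longformer-base-4096")  # assumed checkpoint
model.config.output_hidden_states = True  # config-level default: on

input_ids = torch.tensor([[0, 9064, 2]])  # tiny dummy sequence
with torch.no_grad():
    on = model(input_ids)                               # flag omitted -> falls back to config (True)
    off = model(input_ids, output_hidden_states=False)  # explicit False overrides the config

print(len(on), len(off))  # the "off" tuple is one element shorter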
......
...@@ -163,7 +163,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin): ...@@ -163,7 +163,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
objective during Bert pretraining. This output is usually *not* a good summary objective during Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input; you're often better off averaging or pooling of the semantic content of the input; you're often better off averaging or pooling
the sequence of hidden-states for the whole input sequence. the sequence of hidden-states for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``: of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
...@@ -200,6 +200,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin): ...@@ -200,6 +200,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
inputs_embeds=None, inputs_embeds=None,
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
output_hidden_states=None,
): ):
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
...@@ -257,6 +258,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin): ...@@ -257,6 +258,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
head_mask=head_mask, head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states, encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask, encoder_attention_mask=encoder_extended_attention_mask,
output_hidden_states=output_hidden_states,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
...@@ -293,7 +295,7 @@ class MMBTForClassification(nn.Module): ...@@ -293,7 +295,7 @@ class MMBTForClassification(nn.Module):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) **hidden_states**: (`optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``: of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
......
...@@ -514,7 +514,6 @@ class MobileBertLayer(nn.Module): ...@@ -514,7 +514,6 @@ class MobileBertLayer(nn.Module):
class MobileBertEncoder(nn.Module): class MobileBertEncoder(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
self.output_hidden_states = config.output_hidden_states
self.layer = nn.ModuleList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)]) self.layer = nn.ModuleList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)])
def forward( def forward(
...@@ -525,11 +524,12 @@ class MobileBertEncoder(nn.Module): ...@@ -525,11 +524,12 @@ class MobileBertEncoder(nn.Module):
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
output_attentions=False, output_attentions=False,
output_hidden_states=False,
): ):
all_hidden_states = () all_hidden_states = ()
all_attentions = () all_attentions = ()
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module( layer_outputs = layer_module(
...@@ -546,11 +546,11 @@ class MobileBertEncoder(nn.Module): ...@@ -546,11 +546,11 @@ class MobileBertEncoder(nn.Module):
all_attentions = all_attentions + (layer_outputs[1],) all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer # Add last layer
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
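The encoder refactor above is the same across files: drop the constructor attribute and gate the per-layer collection on the forward argument instead. A distilled, self-contained sketch of that pattern (all names here are illustrative, not from this diff):

import torch
from torch import nn

class TinyEncoder(nn.Module):
    def __init__(self, hidden_size=8, num_layers=3):
        super().__init__()
        # No self.output_hidden_states attribute anymore; the flag is per call.
        self.layers = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))

    def forward(self, hidden_states, output_hidden_states=False):
        all_hidden_states = ()
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            hidden_states = layer(hidden_states)
        if output_hidden_states:  # add last layer
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
        if output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        return outputs

enc = TinyEncoder()
x = torch.randn(2, 4, 8)
print(len(enc(x)), len(enc(x, output_hidden_states=True)[1]))  # 1, then num_layers + 1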
...@@ -757,6 +757,7 @@ class MobileBertModel(MobileBertPreTrainedModel): ...@@ -757,6 +757,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
output_hidden_states=None,
output_attentions=None, output_attentions=None,
): ):
r""" r"""
...@@ -773,7 +774,7 @@ class MobileBertModel(MobileBertPreTrainedModel): ...@@ -773,7 +774,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
This output is usually *not* a good summary This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence. the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -801,6 +802,9 @@ class MobileBertModel(MobileBertPreTrainedModel): ...@@ -801,6 +802,9 @@ class MobileBertModel(MobileBertPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -852,6 +856,7 @@ class MobileBertModel(MobileBertPreTrainedModel): ...@@ -852,6 +856,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
encoder_hidden_states=encoder_hidden_states, encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask, encoder_attention_mask=encoder_extended_attention_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
...@@ -911,6 +916,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): ...@@ -911,6 +916,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
labels=None, labels=None,
next_sentence_label=None, next_sentence_label=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
...@@ -932,7 +938,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): ...@@ -932,7 +938,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax). continuation before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -962,6 +968,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): ...@@ -962,6 +968,7 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
...@@ -1027,6 +1034,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel): ...@@ -1027,6 +1034,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -1044,7 +1052,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel): ...@@ -1044,7 +1052,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1087,6 +1095,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel): ...@@ -1087,6 +1095,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
encoder_hidden_states=encoder_hidden_states, encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask, encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1136,6 +1145,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): ...@@ -1136,6 +1145,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
next_sentence_label=None, next_sentence_label=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1150,7 +1160,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): ...@@ -1150,7 +1160,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
Next sequence prediction (classification) loss. Next sequence prediction (classification) loss.
seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1186,6 +1196,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): ...@@ -1186,6 +1196,7 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
...@@ -1227,6 +1238,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): ...@@ -1227,6 +1238,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1240,7 +1252,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): ...@@ -1240,7 +1252,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
...@@ -1273,6 +1285,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): ...@@ -1273,6 +1285,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output) pooled_output = self.dropout(pooled_output)
...@@ -1317,6 +1330,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): ...@@ -1317,6 +1330,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1336,7 +1350,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): ...@@ -1336,7 +1350,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1376,6 +1390,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): ...@@ -1376,6 +1390,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1432,6 +1447,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): ...@@ -1432,6 +1447,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -1447,7 +1463,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): ...@@ -1447,7 +1463,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
`num_choices` is the second dimension of the input tensors (see `input_ids` above). `num_choices` is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1498,6 +1514,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): ...@@ -1498,6 +1514,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
...@@ -1543,6 +1560,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel): ...@@ -1543,6 +1560,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -1555,7 +1573,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel): ...@@ -1555,7 +1573,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1591,6 +1609,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel): ...@@ -1591,6 +1609,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
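A usage sketch for the refactored MobileBERT (checkpoint name assumed; the tuple layout follows the docstrings above, with attentions left off):

import torch
from transformers import MobileBertModel, MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")  # assumed checkpoint
model = MobileBertModel.from_pretrained("google/mobilebert-uncased")

input_ids = tokenizer.encode("hidden states on demand", return_tensors="pt")
with torch.no_grad():
    sequence_output, pooled_output, hidden_states = model(input_ids, output_hidden_states=True)

# Embeddings output plus one tensor per layer:
assert len(hidden_states) == model.config.num_hidden_layers + 1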
......
...@@ -334,7 +334,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" ...@@ -334,7 +334,6 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
class OpenAIGPTModel(OpenAIGPTPreTrainedModel): class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states
self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
...@@ -366,13 +365,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ...@@ -366,13 +365,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the last layer of the model. Sequence of hidden-states at the last layer of the model.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -397,6 +397,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ...@@ -397,6 +397,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -450,7 +453,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ...@@ -450,7 +453,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
all_attentions = () all_attentions = ()
all_hidden_states = () all_hidden_states = ()
for i, block in enumerate(self.h): for i, block in enumerate(self.h):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions=output_attentions) outputs = block(hidden_states, attention_mask, head_mask[i], output_attentions=output_attentions)
...@@ -459,11 +462,11 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): ...@@ -459,11 +462,11 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
all_attentions = all_attentions + (outputs[1],) all_attentions = all_attentions + (outputs[1],)
# Add last layer # Add last layer
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
outputs = (hidden_states.view(*output_shape),) outputs = (hidden_states.view(*output_shape),)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -497,6 +500,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ...@@ -497,6 +500,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -516,7 +520,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ...@@ -516,7 +520,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids, as they have already been computed. should not be passed as input ids, as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -548,6 +552,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): ...@@ -548,6 +552,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states) lm_logits = self.lm_head(hidden_states)
...@@ -600,6 +605,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): ...@@ -600,6 +605,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
labels=None, labels=None,
mc_labels=None, mc_labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -633,7 +639,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): ...@@ -633,7 +639,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids, as they have already been computed. should not be passed as input ids, as they have already been computed.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -679,6 +685,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): ...@@ -679,6 +685,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
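The same per-call request against the GPT heads, e.g. to grab a final-token feature (checkpoint name assumed; the tuple positions hold only when no labels are passed):

import torch
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

input_ids = tokenizer.encode("the meaning of life is", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

lm_logits = outputs[0]
hidden_states = outputs[1]
feature = hidden_states[-1][:, -1, :]  # last layer, final token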
......
...@@ -1256,7 +1256,7 @@ class _ReversibleFunction(Function): ...@@ -1256,7 +1256,7 @@ class _ReversibleFunction(Function):
num_hashes, num_hashes,
all_hidden_states, all_hidden_states,
all_attentions, all_attentions,
do_output_hidden_states, output_hidden_states,
output_attentions, output_attentions,
): ):
all_buckets = () all_buckets = ()
...@@ -1265,7 +1265,7 @@ class _ReversibleFunction(Function): ...@@ -1265,7 +1265,7 @@ class _ReversibleFunction(Function):
hidden_states, attn_output = torch.chunk(hidden_states, 2, dim=-1) hidden_states, attn_output = torch.chunk(hidden_states, 2, dim=-1)
for layer, layer_head_mask in zip(layers, head_mask): for layer, layer_head_mask in zip(layers, head_mask):
if do_output_hidden_states is True: if output_hidden_states is True:
all_hidden_states.append(hidden_states) all_hidden_states.append(hidden_states)
layer_outputs = layer( layer_outputs = layer(
...@@ -1284,7 +1284,7 @@ class _ReversibleFunction(Function): ...@@ -1284,7 +1284,7 @@ class _ReversibleFunction(Function):
all_attentions.append(layer_outputs.attention_probs) all_attentions.append(layer_outputs.attention_probs)
# Add last layer # Add last layer
if do_output_hidden_states is True: if output_hidden_states is True:
all_hidden_states.append(hidden_states) all_hidden_states.append(hidden_states)
# attach params to ctx for backward # attach params to ctx for backward
...@@ -1360,7 +1360,7 @@ class ReformerEncoder(nn.Module): ...@@ -1360,7 +1360,7 @@ class ReformerEncoder(nn.Module):
attention_mask=None, attention_mask=None,
head_mask=None, head_mask=None,
num_hashes=None, num_hashes=None,
do_output_hidden_states=False, output_hidden_states=False,
output_attentions=False, output_attentions=False,
): ):
# hidden_states and attention lists to be filled if wished # hidden_states and attention lists to be filled if wished
...@@ -1377,7 +1377,7 @@ class ReformerEncoder(nn.Module): ...@@ -1377,7 +1377,7 @@ class ReformerEncoder(nn.Module):
num_hashes, num_hashes,
all_hidden_states, all_hidden_states,
all_attentions, all_attentions,
do_output_hidden_states, output_hidden_states,
output_attentions, output_attentions,
) )
...@@ -1546,7 +1546,7 @@ class ReformerModel(ReformerPreTrainedModel): ...@@ -1546,7 +1546,7 @@ class ReformerModel(ReformerPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
num_hashes=None, num_hashes=None,
do_output_hidden_states=False, output_hidden_states=None,
output_attentions=None, output_attentions=None,
): ):
r""" r"""
...@@ -1554,7 +1554,7 @@ class ReformerModel(ReformerPreTrainedModel): ...@@ -1554,7 +1554,7 @@ class ReformerModel(ReformerPreTrainedModel):
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1581,7 +1581,9 @@ class ReformerModel(ReformerPreTrainedModel): ...@@ -1581,7 +1581,9 @@ class ReformerModel(ReformerPreTrainedModel):
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
do_output_hidden_states = self.config.output_hidden_states output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -1639,7 +1641,7 @@ class ReformerModel(ReformerPreTrainedModel): ...@@ -1639,7 +1641,7 @@ class ReformerModel(ReformerPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
attention_mask=attention_mask, attention_mask=attention_mask,
num_hashes=num_hashes, num_hashes=num_hashes,
do_output_hidden_states=do_output_hidden_states, output_hidden_states=output_hidden_states,
output_attentions=output_attentions, output_attentions=output_attentions,
) )
sequence_output = encoder_outputs.hidden_states sequence_output = encoder_outputs.hidden_states
...@@ -1650,7 +1652,7 @@ class ReformerModel(ReformerPreTrainedModel): ...@@ -1650,7 +1652,7 @@ class ReformerModel(ReformerPreTrainedModel):
outputs = (sequence_output,) outputs = (sequence_output,)
# TODO(PVP): Replace by named tuple after namedtuples are introduced in the library. # TODO(PVP): Replace by named tuple after namedtuples are introduced in the library.
if do_output_hidden_states is True: if output_hidden_states is True:
outputs = outputs + (encoder_outputs.all_hidden_states,) outputs = outputs + (encoder_outputs.all_hidden_states,)
if output_attentions is True: if output_attentions is True:
outputs = outputs + (encoder_outputs.all_attentions,) outputs = outputs + (encoder_outputs.all_attentions,)
...@@ -1740,7 +1742,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): ...@@ -1740,7 +1742,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
num_hashes=None, num_hashes=None,
labels=None, labels=None,
do_output_hidden_states=False, output_hidden_states=None,
output_attentions=None, output_attentions=None,
): ):
r""" r"""
...@@ -1756,7 +1758,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): ...@@ -1756,7 +1758,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Classification loss (cross entropy). Classification loss (cross entropy).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1789,7 +1791,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): ...@@ -1789,7 +1791,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
num_hashes=num_hashes, num_hashes=num_hashes,
do_output_hidden_states=do_output_hidden_states, output_hidden_states=output_hidden_states,
output_attentions=output_attentions, output_attentions=output_attentions,
) )
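Beyond threading the flag through, the Reformer hunks also rename `do_output_hidden_states` to the shared `output_hidden_states`, so its forward() signature matches every other model. A usage sketch under that rename (checkpoint name assumed):

import torch
from transformers import ReformerModelWithLMHead, ReformerTokenizer

tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment")

input_ids = tokenizer.encode("A few words", return_tensors="pt")
with torch.no_grad():
    # The keyword used to be do_output_hidden_states; it is now output_hidden_states.
    outputs = model(input_ids, output_hidden_states=True)

prediction_scores = outputs[0]
all_hidden_states = outputs[1]  # next element when no labels are passed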
......
...@@ -188,6 +188,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): ...@@ -188,6 +188,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -205,7 +206,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): ...@@ -205,7 +206,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -245,6 +246,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): ...@@ -245,6 +246,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output) prediction_scores = self.lm_head(sequence_output)
...@@ -313,6 +315,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): ...@@ -313,6 +315,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -327,7 +330,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): ...@@ -327,7 +330,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -360,6 +363,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): ...@@ -360,6 +363,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
...@@ -407,6 +411,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): ...@@ -407,6 +411,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -422,7 +427,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): ...@@ -422,7 +427,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
`num_choices` is the second dimension of the input tensors (see `input_ids` above). `num_choices` is the second dimension of the input tensors (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -468,6 +473,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): ...@@ -468,6 +473,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=flat_inputs_embeds, inputs_embeds=flat_inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
pooled_output = outputs[1] pooled_output = outputs[1]
...@@ -515,6 +521,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): ...@@ -515,6 +521,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...@@ -527,7 +534,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): ...@@ -527,7 +534,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -561,6 +568,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): ...@@ -561,6 +568,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -636,6 +644,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): ...@@ -636,6 +644,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
start_positions=None, start_positions=None,
end_positions=None, end_positions=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
...@@ -655,7 +664,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): ...@@ -655,7 +664,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -695,6 +704,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): ...@@ -695,6 +704,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
sequence_output = outputs[0] sequence_output = outputs[0]
......
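Note: the RoBERTa heads above now accept output_hidden_states at call time instead of only via the config. A minimal PyTorch sketch of the new call path (the tuple index assumes the default config, i.e. output_attentions=False, and no labels are passed):

    import torch
    from transformers import RobertaTokenizer, RobertaForTokenClassification

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForTokenClassification.from_pretrained('roberta-base')
    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # batch size 1

    # Request hidden states for this call only; the config default stays untouched.
    outputs = model(input_ids, output_hidden_states=True)
    scores = outputs[0]
    hidden_states = outputs[1]  # embeddings output + one tensor per layer
    assert len(hidden_states) == model.config.num_hidden_layers + 1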
...@@ -629,7 +629,6 @@ class T5PreTrainedModel(PreTrainedModel): ...@@ -629,7 +629,6 @@ class T5PreTrainedModel(PreTrainedModel):
class T5Stack(T5PreTrainedModel): class T5Stack(T5PreTrainedModel):
def __init__(self, config, embed_tokens=None): def __init__(self, config, embed_tokens=None):
super().__init__(config) super().__init__(config)
self.output_hidden_states = config.output_hidden_states
self.embed_tokens = embed_tokens self.embed_tokens = embed_tokens
self.is_decoder = config.is_decoder self.is_decoder = config.is_decoder
...@@ -662,9 +661,13 @@ class T5Stack(T5PreTrainedModel): ...@@ -662,9 +661,13 @@ class T5Stack(T5PreTrainedModel):
past_key_value_states=None, past_key_value_states=None,
use_cache=False, use_cache=False,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
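Note: the two added lines above establish the pattern every forward() in this PR follows: a None argument means "fall back to the config". As a standalone sketch (the helper name is illustrative, not part of the library):

    def resolve_output_flags(config, output_attentions=None, output_hidden_states=None):
        # None means "not specified for this call": fall back to the config default.
        output_attentions = output_attentions if output_attentions is not None else config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else config.output_hidden_states
        )
        return output_attentions, output_hidden_states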
...@@ -726,7 +729,7 @@ class T5Stack(T5PreTrainedModel): ...@@ -726,7 +729,7 @@ class T5Stack(T5PreTrainedModel):
hidden_states = self.dropout(inputs_embeds) hidden_states = self.dropout(inputs_embeds)
for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)): for i, (layer_module, past_key_value_state) in enumerate(zip(self.block, past_key_value_states)):
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module( layer_outputs = layer_module(
...@@ -761,14 +764,14 @@ class T5Stack(T5PreTrainedModel): ...@@ -761,14 +764,14 @@ class T5Stack(T5PreTrainedModel):
hidden_states = self.dropout(hidden_states) hidden_states = self.dropout(hidden_states)
# Add last layer # Add last layer
if self.output_hidden_states: if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if use_cache is True: if use_cache is True:
assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
outputs = outputs + (present_key_value_states,) outputs = outputs + (present_key_value_states,)
if self.output_hidden_states: if output_hidden_states:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if output_attentions: if output_attentions:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -895,6 +898,7 @@ class T5Model(T5PreTrainedModel): ...@@ -895,6 +898,7 @@ class T5Model(T5PreTrainedModel):
decoder_inputs_embeds=None, decoder_inputs_embeds=None,
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
): ):
r""" r"""
Returns: Returns:
...@@ -906,7 +910,7 @@ class T5Model(T5PreTrainedModel): ...@@ -906,7 +910,7 @@ class T5Model(T5PreTrainedModel):
Contains pre-computed key and value hidden-states of the attention blocks. Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -938,6 +942,7 @@ class T5Model(T5PreTrainedModel): ...@@ -938,6 +942,7 @@ class T5Model(T5PreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = encoder_outputs[0] hidden_states = encoder_outputs[0]
...@@ -961,6 +966,7 @@ class T5Model(T5PreTrainedModel): ...@@ -961,6 +966,7 @@ class T5Model(T5PreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
if use_cache is True: if use_cache is True:
...@@ -1021,6 +1027,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1021,6 +1027,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
decoder_inputs_embeds=None, decoder_inputs_embeds=None,
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
**kwargs **kwargs
): ):
r""" r"""
...@@ -1043,7 +1050,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1043,7 +1050,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Contains pre-computed key and value hidden-states of the attention blocks. Contains pre-computed key and value hidden-states of the attention blocks.
Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input).
Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
...@@ -1085,6 +1092,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1085,6 +1092,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
head_mask=head_mask, head_mask=head_mask,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
hidden_states = encoder_outputs[0] hidden_states = encoder_outputs[0]
...@@ -1113,6 +1121,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): ...@@ -1113,6 +1121,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
head_mask=head_mask, head_mask=head_mask,
use_cache=use_cache, use_cache=use_cache,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
) )
# insert decoder past at right place # insert decoder past at right place
......
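Note: for T5 the flag is threaded through both stacks, so a single call-time argument controls encoder and decoder alike. A hedged sketch (decoder_input_ids is reused here only to keep the example short, and tuple positions depend on use_cache and the other flags):

    from transformers import T5Tokenizer, T5ForConditionalGeneration

    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    input_ids = tokenizer.encode("translate English to German: Hello", return_tensors='pt')

    # Encoder and decoder hidden states are both collected when the flag is set.
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, output_hidden_states=True)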
...@@ -361,13 +361,12 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): ...@@ -361,13 +361,12 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states
self.albert_layers = [ self.albert_layers = [
TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num) TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num)
] ]
def call(self, inputs, training=False): def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask, output_attentions = inputs hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
layer_hidden_states = () layer_hidden_states = ()
layer_attentions = () layer_attentions = ()
...@@ -381,11 +380,11 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): ...@@ -381,11 +380,11 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
layer_attentions = layer_attentions + (layer_output[1],) layer_attentions = layer_attentions + (layer_output[1],)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
layer_hidden_states = layer_hidden_states + (hidden_states,) layer_hidden_states = layer_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (layer_hidden_states,) outputs = outputs + (layer_hidden_states,)
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
outputs = outputs + (layer_attentions,) outputs = outputs + (layer_attentions,)
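Note: in the TF models the flag may arrive as a graph-mode tensor rather than a Python bool, which is why every check above goes through cast_bool_to_primitive. A rough sketch of what such a helper has to do (a paraphrase for illustration, not the library's exact implementation):

    import tensorflow as tf

    def cast_bool_to_primitive_sketch(flag, default=False):
        # A Python `if` on a symbolic tf.Tensor is ambiguous in graph mode,
        # so reduce the flag to a primitive bool before branching on it.
        if flag is None:
            return default
        if isinstance(flag, tf.Tensor):
            return bool(flag.numpy()) if tf.executing_eagerly() else default
        return bool(flag)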
...@@ -398,7 +397,6 @@ class TFAlbertTransformer(tf.keras.layers.Layer): ...@@ -398,7 +397,6 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.config = config self.config = config
self.output_hidden_states = config.output_hidden_states
self.embedding_hidden_mapping_in = tf.keras.layers.Dense( self.embedding_hidden_mapping_in = tf.keras.layers.Dense(
config.hidden_size, config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range), kernel_initializer=get_initializer(config.initializer_range),
...@@ -410,12 +408,12 @@ class TFAlbertTransformer(tf.keras.layers.Layer): ...@@ -410,12 +408,12 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
] ]
def call(self, inputs, training=False): def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask, output_attentions = inputs hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
hidden_states = self.embedding_hidden_mapping_in(hidden_states) hidden_states = self.embedding_hidden_mapping_in(hidden_states)
all_attentions = () all_attentions = ()
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = (hidden_states,) all_hidden_states = (hidden_states,)
for i in range(self.config.num_hidden_layers): for i in range(self.config.num_hidden_layers):
...@@ -431,6 +429,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer): ...@@ -431,6 +429,7 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
attention_mask, attention_mask,
head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
output_attentions, output_attentions,
output_hidden_states,
], ],
training=training, training=training,
) )
...@@ -439,11 +438,11 @@ class TFAlbertTransformer(tf.keras.layers.Layer): ...@@ -439,11 +438,11 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
all_attentions = all_attentions + layer_group_output[-1] all_attentions = all_attentions + layer_group_output[-1]
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -503,6 +502,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -503,6 +502,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.embeddings = TFAlbertEmbeddings(config, name="embeddings")
self.encoder = TFAlbertTransformer(config, name="encoder") self.encoder = TFAlbertTransformer(config, name="encoder")
...@@ -539,6 +539,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -539,6 +539,7 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -549,7 +550,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -549,7 +550,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -558,11 +560,13 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -558,11 +560,13 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -607,7 +611,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer): ...@@ -607,7 +611,8 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder( encoder_outputs = self.encoder(
[embedding_output, extended_attention_mask, head_mask, output_attentions], training=training [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
training=training,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
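Note: because the TF main layers accept a positional list, a dict, or a bare tensor, the new argument claims positional slot 7 and the input-length assert moves from 7 to 8. The three equivalent call styles, sketched with a hypothetical model and input_ids:

    outputs = model(input_ids)  # bare tensor: everything else defaults
    outputs = model({"input_ids": input_ids, "output_hidden_states": True})  # dict style
    # positional style: slots 1-6 are attention_mask, token_type_ids, position_ids,
    # head_mask, inputs_embeds, output_attentions; slot 7 is output_hidden_states
    outputs = model([input_ids, None, None, None, None, None, None, True])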
...@@ -710,38 +715,39 @@ class TFAlbertModel(TFAlbertPreTrainedModel): ...@@ -710,38 +715,39 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
Last layer hidden-state of the first token of the sequence (classification token) Last layer hidden-state of the first token of the sequence (classification token)
further processed by a Linear layer and a Tanh activation function. The Linear further processed by a Linear layer and a Tanh activation function. The Linear
layer weights are trained from the next sentence prediction (classification) layer weights are trained from the next sentence prediction (classification)
objective during Albert pretraining. This output is usually *not* a good summary objective during Albert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence. the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`: :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2') model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.albert(inputs, **kwargs) outputs = self.albert(inputs, **kwargs)
...@@ -774,7 +780,8 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): ...@@ -774,7 +780,8 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax). Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
...@@ -833,7 +840,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): ...@@ -833,7 +840,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -894,6 +902,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ...@@ -894,6 +902,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -907,7 +916,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ...@@ -907,7 +916,8 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`)
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -940,6 +950,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass ...@@ -940,6 +950,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -984,6 +995,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ...@@ -984,6 +995,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -995,7 +1007,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ...@@ -995,7 +1007,8 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1027,6 +1040,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat ...@@ -1027,6 +1040,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -1073,6 +1087,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ...@@ -1073,6 +1087,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
p_mask=None, p_mask=None,
is_impossible=None, is_impossible=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1091,7 +1106,8 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ...@@ -1091,7 +1106,8 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1128,6 +1144,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL ...@@ -1128,6 +1144,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -1184,6 +1201,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1184,6 +1201,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1198,7 +1216,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1198,7 +1216,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1266,6 +1285,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1266,6 +1285,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states,
] ]
outputs = self.albert(flat_inputs, training=training) outputs = self.albert(flat_inputs, training=training)
......
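Note: end to end, the TF call now mirrors the PyTorch one. A minimal sketch (assuming the default config, so the classifier returns exactly (logits, hidden_states) when only hidden states are requested):

    import tensorflow as tf
    from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1

    logits, hidden_states = model(input_ids, output_hidden_states=True)
    print(len(hidden_states))  # num_hidden_layers + 1 (embeddings output first)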
...@@ -378,16 +378,15 @@ class TFBertLayer(tf.keras.layers.Layer): ...@@ -378,16 +378,15 @@ class TFBertLayer(tf.keras.layers.Layer):
class TFBertEncoder(tf.keras.layers.Layer): class TFBertEncoder(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.output_hidden_states = config.output_hidden_states
self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]
def call(self, inputs, training=False): def call(self, inputs, training=False):
hidden_states, attention_mask, head_mask, output_attentions = inputs hidden_states, attention_mask, head_mask, output_attentions, output_hidden_states = inputs
all_hidden_states = () all_hidden_states = ()
all_attentions = () all_attentions = ()
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
layer_outputs = layer_module( layer_outputs = layer_module(
...@@ -399,11 +398,11 @@ class TFBertEncoder(tf.keras.layers.Layer): ...@@ -399,11 +398,11 @@ class TFBertEncoder(tf.keras.layers.Layer):
all_attentions = all_attentions + (layer_outputs[1],) all_attentions = all_attentions + (layer_outputs[1],)
# Add last layer # Add last layer
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -499,6 +498,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -499,6 +498,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.initializer_range = config.initializer_range self.initializer_range = config.initializer_range
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embeddings = TFBertEmbeddings(config, name="embeddings") self.embeddings = TFBertEmbeddings(config, name="embeddings")
self.encoder = TFBertEncoder(config, name="encoder") self.encoder = TFBertEncoder(config, name="encoder")
...@@ -527,6 +527,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -527,6 +527,7 @@ class TFBertMainLayer(tf.keras.layers.Layer):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -537,7 +538,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -537,7 +538,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -546,11 +548,13 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -546,11 +548,13 @@ class TFBertMainLayer(tf.keras.layers.Layer):
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -595,7 +599,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -595,7 +599,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
encoder_outputs = self.encoder( encoder_outputs = self.encoder(
[embedding_output, extended_attention_mask, head_mask, output_attentions], training=training [embedding_output, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
training=training,
) )
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
...@@ -712,7 +717,8 @@ class TFBertModel(TFBertPreTrainedModel): ...@@ -712,7 +717,8 @@ class TFBertModel(TFBertPreTrainedModel):
objective during Bert pretraining. This output is usually *not* a good summary objective during Bert pretraining. This output is usually *not* a good summary
of the semantic content of the input, you're often better with averaging or pooling of the semantic content of the input, you're often better with averaging or pooling
the sequence of hidden-states for the whole input sequence. the sequence of hidden-states for the whole input sequence.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -764,7 +770,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel): ...@@ -764,7 +770,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -818,7 +825,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): ...@@ -818,7 +825,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -868,7 +876,8 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -868,7 +876,8 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`)
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -931,6 +940,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -931,6 +940,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -944,7 +954,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -944,7 +954,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -977,6 +988,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -977,6 +988,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -1029,6 +1041,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1029,6 +1041,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1043,7 +1056,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1043,7 +1056,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1116,6 +1130,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1116,6 +1130,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
head_mask, head_mask,
flat_inputs_embeds, flat_inputs_embeds,
output_attentions, output_attentions,
output_hidden_states,
] ]
outputs = self.bert(flat_inputs, training=training) outputs = self.bert(flat_inputs, training=training)
...@@ -1162,6 +1177,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1162,6 +1177,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1173,7 +1189,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1173,7 +1189,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1205,6 +1222,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1205,6 +1222,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -1252,6 +1270,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -1252,6 +1270,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
p_mask=None, p_mask=None,
is_impossible=None, is_impossible=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1270,7 +1289,8 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -1270,7 +1289,8 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1305,6 +1325,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -1305,6 +1325,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
......
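Note: the config route still works and the per-call flag merely overrides it, so existing code keeps its behaviour. A sketch of the two equivalent ways to get BERT hidden states (assuming default settings otherwise):

    from transformers import BertConfig, TFBertModel

    # old route: bake the flag into the config
    config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
    model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    # new route: leave the config alone and ask per call
    model = TFBertModel.from_pretrained('bert-base-uncased')
    outputs = model(input_ids, output_hidden_states=True)  # input_ids as in the earlier sketch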
...@@ -237,6 +237,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -237,6 +237,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
inputs_embeds=None, inputs_embeds=None,
use_cache=True, use_cache=True,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
...@@ -250,7 +251,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -250,7 +251,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
use_cache = inputs[7] if len(inputs) > 7 else use_cache use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 8 else output_attentions output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
assert len(inputs) <= 9, "Too many inputs." output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
past = inputs.get("past", past) past = inputs.get("past", past)
...@@ -261,11 +263,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -261,11 +263,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
use_cache = inputs.get("use_cache", use_cache) use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 9, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 10, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
# If using past key value states, only the last tokens # If using past key value states, only the last tokens
# should be given as an input # should be given as an input
...@@ -351,7 +355,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -351,7 +355,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
all_hidden_states = () all_hidden_states = ()
all_attentions = [] all_attentions = []
for i, (h, layer_past) in enumerate(zip(self.h, past)): for i, (h, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = h( outputs = h(
[hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache, output_attentions], [hidden_states, mask, layer_past, attention_mask, head_mask[i], use_cache, output_attentions],
...@@ -367,13 +371,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -367,13 +371,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
hidden_states = self.layernorm(hidden_states) hidden_states = self.layernorm(hidden_states)
hidden_states = tf.reshape(hidden_states, output_shape) hidden_states = tf.reshape(hidden_states, output_shape)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_states,) all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,) outputs = (hidden_states,)
if use_cache is True: if use_cache is True:
outputs = outputs + (presents,) outputs = outputs + (presents,)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
# let the number of heads free (-1) so we can extract attention even after head pruning # let the number of heads free (-1) so we can extract attention even after head pruning
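A note on `cast_bool_to_primitive`, which replaces the plain attribute check in the TF models: under graph tracing a Python `if` cannot branch on a symbolic boolean tensor, so the flag is first reduced to a Python bool. The real helper lives in `modeling_tf_utils.py` and differs in detail; the sketch below only illustrates the idea and is not the library's actual code:

```python
import tensorflow as tf

def cast_bool_to_primitive_sketch(flag, default=False):
    """Illustrative only: reduce a maybe-tensor boolean to a Python bool
    so that `if flag is True:` behaves deterministically while tracing."""
    if flag is None:
        return default
    if isinstance(flag, tf.Tensor):
        # Eagerly we can read the value; inside a traced graph the real
        # helper has to fall back to a default instead.
        return bool(flag.numpy()) if tf.executing_eagerly() else default
    return bool(flag)
```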
...@@ -493,7 +497,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): ...@@ -493,7 +497,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed. should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -573,7 +577,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): ...@@ -573,7 +577,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Contains pre-computed hidden-states (key and values in the attention blocks). Contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
should not be passed as input ids as they have already been computed. should not be passed as input ids as they have already been computed.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
......
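As a usage sketch for the CTRL changes: the returned tuple grows with whatever is requested, so its indices shift with `use_cache` and the two output flags. Assuming the public `ctrl` checkpoint (a large download) and `use_cache=False`, so the hidden states land at index 1:

```python
from transformers import CTRLTokenizer, TFCTRLModel

tokenizer = CTRLTokenizer.from_pretrained("ctrl")
model = TFCTRLModel.from_pretrained("ctrl")

input_ids = tokenizer.encode("Hello world", return_tensors="tf")
outputs = model(input_ids, use_cache=False, output_hidden_states=True)

last_hidden = outputs[0]   # (batch, seq_len, n_embd)
all_hidden = outputs[1]    # with use_cache=True, `presents` would sit here
```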
...@@ -351,7 +351,6 @@ class TFTransformer(tf.keras.layers.Layer): ...@@ -351,7 +351,6 @@ class TFTransformer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.n_layers = config.n_layers self.n_layers = config.n_layers
self.output_hidden_states = config.output_hidden_states
self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)]
...@@ -375,14 +374,14 @@ class TFTransformer(tf.keras.layers.Layer): ...@@ -375,14 +374,14 @@ class TFTransformer(tf.keras.layers.Layer):
Tuple of length n_layers with the attention weights from each layer Tuple of length n_layers with the attention weights from each layer
Optional: only if output_attentions=True Optional: only if output_attentions=True
""" """
x, attn_mask, head_mask, output_attentions = inputs x, attn_mask, head_mask, output_attentions, output_hidden_states = inputs
all_hidden_states = () all_hidden_states = ()
all_attentions = () all_attentions = ()
hidden_state = x hidden_state = x
for i, layer_module in enumerate(self.layer): for i, layer_module in enumerate(self.layer):
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_state,) all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i], output_attentions], training=training) layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i], output_attentions], training=training)
...@@ -396,11 +395,11 @@ class TFTransformer(tf.keras.layers.Layer): ...@@ -396,11 +395,11 @@ class TFTransformer(tf.keras.layers.Layer):
assert len(layer_outputs) == 1 assert len(layer_outputs) == 1
# Add last layer # Add last layer
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
all_hidden_states = all_hidden_states + (hidden_state,) all_hidden_states = all_hidden_states + (hidden_state,)
outputs = (hidden_state,) outputs = (hidden_state,)
if self.output_hidden_states: if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (all_hidden_states,) outputs = outputs + (all_hidden_states,)
if cast_bool_to_primitive(output_attentions) is True: if cast_bool_to_primitive(output_attentions) is True:
outputs = outputs + (all_attentions,) outputs = outputs + (all_attentions,)
...@@ -415,6 +414,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -415,6 +414,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
super().__init__(**kwargs) super().__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
self.output_attentions = config.output_attentions self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings
self.transformer = TFTransformer(config, name="transformer") # Encoder self.transformer = TFTransformer(config, name="transformer") # Encoder
...@@ -430,7 +430,14 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -430,7 +430,14 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def call( def call(
self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, output_attentions=None, training=False self,
inputs,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -438,18 +445,21 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -438,18 +445,21 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
head_mask = inputs[2] if len(inputs) > 2 else head_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
output_attentions = inputs[4] if len(inputs) > 4 else output_attentions output_attentions = inputs[4] if len(inputs) > 4 else output_attentions
assert len(inputs) <= 5, "Too many inputs." output_hidden_states = inputs[5] if len(inputs) > 5 else output_hidden_states
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 5, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 6, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.output_attentions output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -476,7 +486,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -476,7 +486,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim)
tfmr_output = self.transformer( tfmr_output = self.transformer(
[embedding_output, attention_mask, head_mask, output_attentions], training=training [embedding_output, attention_mask, head_mask, output_attentions, output_hidden_states], training=training
) )
return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
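The three input branches above (positional list, dict/`BatchEncoding`, bare tensor) mean the new flag can be supplied in any of the following ways; in the list form it is positional, sitting at index 5. A sketch assuming the public `distilbert-base-uncased` checkpoint:

```python
from transformers import DistilBertTokenizer, TFDistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
enc = tokenizer.encode_plus("Hello world", return_tensors="tf")

# 1) keyword form
out = model(enc["input_ids"], attention_mask=enc["attention_mask"],
            output_hidden_states=True)

# 2) positional-list form: [input_ids, attention_mask, head_mask,
#    inputs_embeds, output_attentions, output_hidden_states]
out = model([enc["input_ids"], enc["attention_mask"], None, None, None, True])

# 3) dict form: the key is looked up with .get()
out = model({"input_ids": enc["input_ids"], "output_hidden_states": True})
```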
...@@ -571,7 +581,8 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): ...@@ -571,7 +581,8 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -622,7 +633,6 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -622,7 +633,6 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.vocab_size = config.vocab_size self.vocab_size = config.vocab_size
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
...@@ -644,7 +654,8 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): ...@@ -644,7 +654,8 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -710,6 +721,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ...@@ -710,6 +721,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -723,7 +735,8 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ...@@ -723,7 +735,8 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -753,6 +766,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ...@@ -753,6 +766,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -796,6 +810,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -796,6 +810,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -807,7 +822,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -807,7 +822,8 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -837,6 +853,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -837,6 +853,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
...@@ -893,6 +910,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -893,6 +910,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -907,7 +925,8 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -907,7 +925,8 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -964,6 +983,8 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -964,6 +983,8 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
flat_attention_mask, flat_attention_mask,
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions,
output_hidden_states,
] ]
distilbert_output = self.distilbert(flat_inputs, training=training) distilbert_output = self.distilbert(flat_inputs, training=training)
...@@ -1012,6 +1033,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ...@@ -1012,6 +1033,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
p_mask=None, p_mask=None,
is_impossible=None, is_impossible=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1030,7 +1052,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ...@@ -1030,7 +1052,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -1061,6 +1084,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ...@@ -1061,6 +1084,8 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
attention_mask=attention_mask, attention_mask=attention_mask,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
......
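To round off the DistilBERT changes, the task heads simply forward the flag, so the extra tuple element shows up after the task outputs. A sketch with the public SQuAD-fine-tuned checkpoint (assuming its TF weights are available); with only the hidden-states flag set, the output unpacks into three elements:

```python
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

enc = tokenizer.encode_plus("Who wrote it?", "It was written by a tool.",
                            return_tensors="tf")
start_scores, end_scores, hidden_states = model(
    enc["input_ids"], output_hidden_states=True
)
# hidden_states: tuple of n_layers + 1 tensors (embeddings + each layer)
```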
...@@ -240,6 +240,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -240,6 +240,7 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -250,7 +251,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -250,7 +251,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
output_attentions = inputs[6] if len(inputs) > 6 else output_attentions output_attentions = inputs[6] if len(inputs) > 6 else output_attentions
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs[7] if len(inputs) > 7 else output_hidden_states
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -259,11 +261,15 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -259,11 +261,15 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 7, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 8, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
if input_ids is not None and inputs_embeds is not None: if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
...@@ -288,7 +294,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -288,7 +294,8 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
hidden_states = self.embeddings_project(hidden_states, training=training) hidden_states = self.embeddings_project(hidden_states, training=training)
hidden_states = self.encoder( hidden_states = self.encoder(
[hidden_states, extended_attention_mask, head_mask, output_attentions], training=training [hidden_states, extended_attention_mask, head_mask, output_attentions, output_hidden_states],
training=training,
) )
return hidden_states return hidden_states
...@@ -382,7 +389,8 @@ class TFElectraModel(TFElectraPreTrainedModel): ...@@ -382,7 +389,8 @@ class TFElectraModel(TFElectraPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model. Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -433,6 +441,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): ...@@ -433,6 +441,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -440,7 +449,8 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): ...@@ -440,7 +449,8 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Prediction scores of the head (scores for each token before SoftMax). Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -471,6 +481,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): ...@@ -471,6 +481,7 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
...@@ -530,6 +541,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): ...@@ -530,6 +541,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -537,7 +549,8 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): ...@@ -537,7 +549,8 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -569,6 +582,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): ...@@ -569,6 +582,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training, training=training,
) )
generator_sequence_output = generator_hidden_states[0] generator_sequence_output = generator_hidden_states[0]
...@@ -607,6 +621,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ...@@ -607,6 +621,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
inputs_embeds=None, inputs_embeds=None,
labels=None, labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -618,7 +633,8 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ...@@ -618,7 +633,8 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -651,6 +667,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ...@@ -651,6 +667,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
...@@ -696,6 +713,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -696,6 +713,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
p_mask=None, p_mask=None,
is_impossible=None, is_impossible=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None,
training=False, training=False,
): ):
r""" r"""
...@@ -714,7 +732,8 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -714,7 +732,8 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when
``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
...@@ -749,6 +768,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -749,6 +768,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
head_mask, head_mask,
inputs_embeds, inputs_embeds,
output_attentions, output_attentions,
output_hidden_states,
training=training, training=training,
) )
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
......
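Finally, for ELECTRA (as for every model touched by this commit) the per-call value wins, and the config default only fills in when the argument is left as `None`. A sketch with the small public discriminator checkpoint:

```python
from transformers import ElectraTokenizer, TFElectraModel

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
# Config-level default: every call returns hidden states...
model = TFElectraModel.from_pretrained(
    "google/electra-small-discriminator", output_hidden_states=True
)

input_ids = tokenizer.encode("Hello world", return_tensors="tf")
outputs = model(input_ids)                              # includes hidden states
# ...unless an individual call overrides it explicitly.
outputs = model(input_ids, output_hidden_states=False)  # suppressed for this call
```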