Unverified Commit 4dc65591 authored by Patrick von Platen, committed by GitHub


[Almost all TF models] TF clean up: add missing CLM / MLM loss; fix T5 naming and keras compile (#5395)

* add first version of clm tf

* make style

* add more tests for bert

* update tf clm loss

* fix tests

* correct tf ner script

* add mlm loss

* delete bogus file

* clean tf auto model + add tests

* finish adding clm loss everywhere

* fix training in distilbert

* fix flake8

* save intermediate

* fix tf t5 naming

* remove prints

* finish up

* up

* fix tf gpt2

* fix new test utils import

* fix flake8

* keep backward compatibility

* Update src/transformers/modeling_tf_albert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_electra.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_roberta.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_mobilebert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_bert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_distilbert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply Sylvain's suggestions
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 33e43edd
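
In practice, the user-facing effect of this change is that most TF model classes now accept a `labels` argument and, when it is given, prepend the loss to the returned tuple, mirroring the PyTorch API. A minimal sketch of the new masked-LM call pattern (the checkpoint and example sentence are illustrative, not taken from this commit):

    from transformers import BertTokenizer, TFBertForMaskedLM

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = TFBertForMaskedLM.from_pretrained("bert-base-cased")

    inputs = tokenizer("The capital of France is Paris.", return_tensors="tf")
    # Reuse the input ids as labels; positions set to -100 would be excluded from the loss.
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=inputs["input_ids"])
    loss, prediction_scores = outputs[:2]  # the loss is only prepended because labels were passed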
@@ -17,6 +17,7 @@
 import logging
 import os
+import warnings
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple
@@ -184,7 +185,12 @@ def main():
     for i in range(batch_size):
         for j in range(seq_len):
-            if label_ids[i, j] != -1:
+            if label_ids[i, j] == -1:
+                label_ids[i, j] = -100
+                warnings.warn(
+                    "Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead."
+                )
+            if label_ids[i, j] != -100:
                 out_label_list[i].append(label_map[label_ids[i][j]])
                 preds_list[i].append(label_map[preds[i][j]])
...
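
The script above previously masked ignored positions with `-1`; the TF losses introduced in this commit follow the PyTorch convention of ignoring `-100`, hence the in-place rewrite plus deprecation warning. A tiny illustration of the convention (values are made up):

    import numpy as np

    label_ids = np.array([[3, 7, -1, 5]])
    label_ids[label_ids == -1] = -100   # migrate the old ignore index
    active = label_ids != -100          # only these positions contribute to the loss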
@@ -453,6 +453,9 @@ if is_tf_available():
         TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
         TF_MODEL_WITH_LM_HEAD_MAPPING,
+        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_MASKED_LM_MAPPING,
+        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         TFAutoModel,
         TFAutoModelForMultipleChoice,
         TFAutoModelForPreTraining,
@@ -460,6 +463,9 @@ if is_tf_available():
         TFAutoModelForSequenceClassification,
         TFAutoModelForTokenClassification,
         TFAutoModelWithLMHead,
+        TFAutoModelForCausalLM,
+        TFAutoModelForMaskedLM,
+        TFAutoModelForSeq2SeqLM,
     )
     from .modeling_tf_albert import (
@@ -478,6 +484,7 @@ if is_tf_available():
     from .modeling_tf_bert import (
         TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFBertEmbeddings,
+        TFBertLMHeadModel,
         TFBertForMaskedLM,
         TFBertForMultipleChoice,
         TFBertForNextSentencePrediction,
...
@@ -73,6 +73,7 @@ from .modeling_bert import (
 from .modeling_camembert import (
     CamembertForMaskedLM,
     CamembertForMultipleChoice,
+    CamembertForQuestionAnswering,
     CamembertForSequenceClassification,
     CamembertForTokenClassification,
     CamembertModel,
@@ -306,6 +307,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
     [
         (DistilBertConfig, DistilBertForQuestionAnswering),
         (AlbertConfig, AlbertForQuestionAnswering),
+        (CamembertConfig, CamembertForQuestionAnswering),
         (BartConfig, BartForQuestionAnswering),
         (LongformerConfig, LongformerForQuestionAnswering),
         (XLMRobertaConfig, XLMRobertaForQuestionAnswering),
@@ -336,7 +338,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
     ]
 )
 MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
     [
         (CamembertConfig, CamembertForMultipleChoice),
...
@@ -29,6 +29,7 @@ from .file_utils import (
 )
 from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -822,7 +823,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
 @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
-class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
+class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -834,8 +835,26 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
         Returns:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
         prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`
@@ -852,14 +871,35 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        outputs = self.albert(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.albert(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         sequence_output = outputs[0]
-        prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
+        prediction_scores = self.predictions(sequence_output, training=training)
         # Add hidden states and attention if they are here
         outputs = (prediction_scores,) + outputs[2:]
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            outputs = (loss,) + outputs
         return outputs  # prediction_scores, (hidden_states), (attentions)
...
This diff is collapsed.
@@ -29,6 +29,8 @@ from .file_utils import (
     add_start_docstrings_to_callable,
 )
 from .modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -803,9 +805,12 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
 @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
-class TFBertForMaskedLM(TFBertPreTrainedModel):
+class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
+        assert (
+            not config.is_decoder
+        ), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."
         self.bert = TFBertMainLayer(config, name="bert")
         self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
@@ -815,8 +820,26 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
         prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -833,13 +856,113 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        outputs = self.bert(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.bert(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
+        prediction_scores = self.mlm(sequence_output, training=training)
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), prediction_scores, (hidden_states), (attentions)
+class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        assert config.is_decoder, "If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`"
+        self.bert = TFBertMainLayer(config, name="bert")
+        self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
+    def get_output_embeddings(self):
+        return self.bert.embeddings
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
+        r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the cross entropy classification loss.
+            Indices should be in ``[0, ..., config.vocab_size - 1]``.
+        Return:
+        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        """
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.bert(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
+        sequence_output = outputs[0]
+        logits = self.mlm(sequence_output, training=training)
+        outputs = (logits,) + outputs[2:]  # Add hidden states and attention if they are here
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.compute_loss(labels, logits)
+            outputs = (loss,) + outputs
         return outputs  # prediction_scores, (hidden_states), (attentions)
...
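
`TFBertLMHeadModel` above is the new causal-LM head; when `labels` are supplied, the logits are shifted one position against the labels before the loss is computed, so each position is scored on predicting the next token. A hedged sketch of that shift with toy shapes (not taken from the test suite):

    import tensorflow as tf

    logits = tf.random.uniform((1, 5, 30522))             # (batch, seq_len, vocab) from the LM head
    labels = tf.constant([[101, 7592, 2088, 102, -100]])  # -100 marks a position to ignore

    shifted_logits = logits[:, :-1]  # drop the last position: there is no next token to predict
    shifted_labels = labels[:, 1:]   # each remaining position is supervised by the following token
    # TFCausalLanguageModelingLoss.compute_loss then drops the -100 positions and applies
    # sparse categorical cross-entropy (from_logits=True) to what is left.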
@@ -24,6 +24,7 @@ import tensorflow as tf
 from .configuration_ctrl import CTRLConfig
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
     TFPreTrainedModel,
     TFSharedEmbeddings,
     cast_bool_to_primitive,
@@ -542,7 +543,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
     (linear layer with weights tied to the input embeddings). """,
     CTRL_START_DOCSTRING,
 )
-class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
+class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
@@ -561,8 +562,26 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the cross entropy classification loss.
+            Indices should be in ``[0, ..., config.vocab_size - 1]``.
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
         prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -583,11 +602,37 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        transformer_outputs = self.transformer(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[10] if len(inputs) > 10 else labels
+            if len(inputs) > 10:
+                inputs = inputs[:10]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        transformer_outputs = self.transformer(
+            inputs,
+            past=past,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         hidden_states = transformer_outputs[0]
-        lm_logits = self.lm_head(hidden_states)
-        outputs = (lm_logits,) + transformer_outputs[1:]
+        logits = self.lm_head(hidden_states)
+        outputs = (logits,) + transformer_outputs[1:]
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.compute_loss(labels, logits)
+            outputs = (loss,) + outputs
         return outputs  # lm_logits, presents, (all hidden_states), (attentions)
@@ -30,6 +30,7 @@ from .file_utils import (
     add_start_docstrings_to_callable,
 )
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -116,7 +117,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
     def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
         """Get token embeddings of inputs.
         Args:
-            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
+            inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids)
             mode: string, a valid value is one of "embedding" and "linear".
         Returns:
             outputs: (1) If mode == "embedding", output embedding tensor, float32 with
@@ -528,9 +529,9 @@ DISTILBERT_START_DOCSTRING = r"""
     - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)`
     - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
-      :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
+      :obj:`model([input_ids, attention_mask])`
     - a dictionary with one or several input Tensors associated to the input names given in the docstring:
-      :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+      :obj:`model({'input_ids': input_ids})`
     Parameters:
         config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
@@ -626,7 +627,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
 @add_start_docstrings(
     """DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
 )
-class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
+class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.vocab_size = config.vocab_size
@@ -644,8 +645,23 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
         Returns:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs:
@@ -663,7 +679,22 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        distilbert_output = self.distilbert(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[6] if len(inputs) > 6 else labels
+            if len(inputs) > 6:
+                inputs = inputs[:6]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        distilbert_output = self.distilbert(
+            inputs,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         hidden_states = distilbert_output[0]  # (bs, seq_length, dim)
         prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
@@ -672,6 +703,11 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         prediction_logits = self.vocab_projector(prediction_logits)
         outputs = (prediction_logits,) + distilbert_output[1:]
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_logits)
+            outputs = (loss,) + outputs
         return outputs  # logits, (hidden_states), (attentions)
...
@@ -7,6 +7,7 @@ from transformers import ElectraConfig
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFQuestionAnsweringLoss,
     TFTokenClassificationLoss,
     get_initializer,
@@ -506,7 +507,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
     the only model of the two to have been trained for the masked language modeling task.""",
     ELECTRA_START_DOCSTRING,
 )
-class TFElectraForMaskedLM(TFElectraPreTrainedModel):
+class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, **kwargs):
         super().__init__(config, **kwargs)
@@ -534,9 +535,16 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
         inputs_embeds=None,
         output_attentions=None,
         output_hidden_states=None,
+        labels=None,
         training=False,
     ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
         Returns:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
         prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -553,6 +561,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
+        if isinstance(input_ids, (tuple, list)):
+            labels = input_ids[8] if len(input_ids) > 8 else labels
+            if len(input_ids) > 8:
+                input_ids = input_ids[:8]
+        elif isinstance(input_ids, (dict, BatchEncoding)):
+            labels = input_ids.pop("labels", labels)
         generator_hidden_states = self.electra(
             input_ids,
@@ -571,6 +585,10 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
         output = (prediction_scores,)
         output += generator_hidden_states[1:]
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            output = (loss,) + output
         return output  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
...
@@ -24,6 +24,7 @@ import tensorflow as tf
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
     TFConv1D,
     TFPreTrainedModel,
     TFSequenceSummary,
@@ -272,8 +273,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
             head_mask = inputs[5] if len(inputs) > 5 else head_mask
             inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
             use_cache = inputs[7] if len(inputs) > 7 else use_cache
-            output_attentions = inputs[8] if len(inputs) > 7 else output_attentions
-            output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states
+            output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
+            output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
             assert len(inputs) <= 10, "Too many inputs."
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
@@ -524,7 +525,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
     (linear layer with weights tied to the input embeddings). """,
     GPT2_START_DOCSTRING,
 )
-class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
+class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.transformer = TFGPT2MainLayer(config, name="transformer")
@@ -541,8 +542,26 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs,
+        past=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the cross entropy classification loss.
+            Indices should be in ``[0, ..., config.vocab_size - 1]``.
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
         prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -563,12 +582,38 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        transformer_outputs = self.transformer(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[10] if len(inputs) > 10 else labels
+            if len(inputs) > 10:
+                inputs = inputs[:10]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        transformer_outputs = self.transformer(
+            inputs,
+            past=past,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         hidden_states = transformer_outputs[0]
-        lm_logits = self.transformer.wte(hidden_states, mode="linear")
-        outputs = (lm_logits,) + transformer_outputs[1:]
+        logits = self.transformer.wte(hidden_states, mode="linear")
+        outputs = (logits,) + transformer_outputs[1:]
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.compute_loss(labels, logits)
+            outputs = (loss,) + outputs
         return outputs  # lm_logits, presents, (all hidden_states), (attentions)
...
@@ -29,6 +29,7 @@ from .file_utils import (
 )
 from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -929,7 +930,7 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
 @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
-class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
+class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -941,8 +942,25 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
         prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -959,14 +977,34 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        outputs = self.mobilebert(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.mobilebert(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
+        prediction_scores = self.mlm(sequence_output, training=training)
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            outputs = (loss,) + outputs
-        return outputs  # prediction_scores, (hidden_states), (attentions)
+        return outputs  # (loss), prediction_scores, (hidden_states), (attentions)
 class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer):
...
@@ -24,6 +24,7 @@ import tensorflow as tf
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
     TFConv1D,
     TFPreTrainedModel,
     TFSequenceSummary,
@@ -479,7 +480,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
     (linear layer with weights tied to the input embeddings). """,
     OPENAI_GPT_START_DOCSTRING,
 )
-class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
+class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@@ -489,8 +490,24 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the cross entropy classification loss.
+            Indices should be in ``[0, ..., config.vocab_size - 1]``.
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
         prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -507,12 +524,35 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        transformer_outputs = self.transformer(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        transformer_outputs = self.transformer(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         hidden_states = transformer_outputs[0]
-        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
-        outputs = (lm_logits,) + transformer_outputs[1:]
+        logits = self.transformer.tokens_embed(hidden_states, mode="linear")
+        outputs = (logits,) + transformer_outputs[1:]
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.compute_loss(labels, logits)
+            outputs = (loss,) + outputs
         return outputs  # lm_logits, (all hidden_states), (attentions)
...
@@ -29,6 +29,7 @@ from .file_utils import (
 )
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -264,7 +265,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
-class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
+class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -276,8 +277,26 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
         Return:
         :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
         prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -294,14 +313,37 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        outputs = self.roberta(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.roberta(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
+        sequence_output = outputs[0]
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
-        return outputs  # prediction_scores, (hidden_states), (attentions)
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), prediction_scores, (hidden_states), (attentions)
 class TFRobertaClassificationHead(tf.keras.layers.Layer):
...
This diff is collapsed.
@@ -17,6 +17,7 @@
 import functools
 import logging
 import os
+import warnings
 import h5py
 import numpy as np
@@ -107,6 +108,19 @@ def keras_serializable(cls):
     return cls
+class TFCausalLanguageModelingLoss:
+    def compute_loss(self, labels, logits):
+        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
+            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
+        )
+        # make sure only labels that are not equal to -100
+        # are taken into account as loss
+        active_loss = tf.reshape(labels, (-1,)) != -100
+        reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
+        labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
+        return loss_fn(labels, reduced_logits)
 class TFQuestionAnsweringLoss:
     def compute_loss(self, labels, logits):
         loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
@@ -123,7 +137,13 @@ class TFTokenClassificationLoss:
         loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
             from_logits=True, reduction=tf.keras.losses.Reduction.NONE
         )
-        active_loss = tf.reshape(labels, (-1,)) != -1
+        # make sure only labels that are not equal to -100
+        # are taken into account as loss
+        if tf.math.reduce_any(labels == -1).numpy() is True:
+            warnings.warn("Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead.")
+            active_loss = tf.reshape(labels, (-1,)) != -1
+        else:
+            active_loss = tf.reshape(labels, (-1,)) != -100
         reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
         labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
@@ -143,6 +163,7 @@ class TFSequenceClassificationLoss:
 TFMultipleChoiceLoss = TFSequenceClassificationLoss
+TFMaskedLanguageModelingLoss = TFCausalLanguageModelingLoss
 class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
...
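
The new `TFCausalLanguageModelingLoss.compute_loss` above (also aliased as `TFMaskedLanguageModelingLoss`) flattens logits and labels, drops every position labelled `-100`, and applies sparse categorical cross-entropy without reduction. A small standalone check of that masking behaviour, with toy shapes chosen for illustration:

    import tensorflow as tf

    logits = tf.random.uniform((2, 4, 10))   # (batch_size, seq_len, vocab_size)
    labels = tf.constant([[1, 2, -100, 3], [4, -100, -100, 5]])

    active = tf.reshape(labels, (-1,)) != -100
    kept_logits = tf.boolean_mask(tf.reshape(logits, (-1, 10)), active)
    kept_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    per_token_loss = loss_fn(kept_labels, kept_logits)  # shape (5,): one value per unmasked position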
...@@ -30,6 +30,7 @@ from .file_utils import ( ...@@ -30,6 +30,7 @@ from .file_utils import (
add_start_docstrings_to_callable, add_start_docstrings_to_callable,
) )
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
TFQuestionAnsweringLoss, TFQuestionAnsweringLoss,
...@@ -871,7 +872,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel): ...@@ -871,7 +872,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
(linear layer with weights tied to the input embeddings). """, (linear layer with weights tied to the input embeddings). """,
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")
...@@ -912,8 +913,28 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): ...@@ -912,8 +913,28 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
return inputs return inputs
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
inputs,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -957,12 +978,40 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 12:
inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
inputs,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state, training=training)
outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are present
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs  # return logits, (mems), (hidden states), (attentions)
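The shift above is what turns the LM head output into a causal LM loss: the logit at position `i` is scored against the token at position `i + 1`, and the last logit (which has no next token) is dropped. A toy illustration with made-up tensors, not taken from the library:

```python
import tensorflow as tf

logits = tf.random.uniform((1, 4, 32))   # (batch, seq_len, vocab_size)
labels = tf.constant([[3, 7, 1, -100]])  # -100 marks a position excluded from the loss

shifted_logits = logits[:, :-1]          # predictions for positions 0..2
shifted_labels = labels[:, 1:]           # targets are the tokens at positions 1..3

assert shifted_logits.shape[1] == shifted_labels.shape[1] == 3
```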
...
...@@ -1041,9 +1041,9 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
head_mask=None,
inputs_embeds=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
labels=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
...
...@@ -17,7 +17,7 @@
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, ids_tensor
...@@ -32,6 +32,7 @@ if is_torch_available():
DistilBertForTokenClassification,
DistilBertForQuestionAnswering,
DistilBertForSequenceClassification,
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)
class DistilBertModelTester(object):
...@@ -276,8 +277,8 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = DistilBertModel.from_pretrained(model_name)
self.assertIsNotNone(model)
...@@ -24,6 +24,8 @@ if is_tf_available():
from transformers import (
AutoConfig,
BertConfig,
GPT2Config,
T5Config,
TFAutoModel,
TFBertModel,
TFAutoModelForPreTraining,
...@@ -35,6 +37,25 @@ if is_tf_available():
TFBertForSequenceClassification,
TFAutoModelForQuestionAnswering,
TFBertForQuestionAnswering,
TFAutoModelForCausalLM,
TFGPT2LMHeadModel,
TFAutoModelForMaskedLM,
TFAutoModelForSeq2SeqLM,
TFT5ForConditionalGeneration,
)
from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_auto import (
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
)
...@@ -72,10 +93,21 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForPreTraining)
@slow
def test_model_for_causal_lm(self):
for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, GPT2Config)
model = TFAutoModelForCausalLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFGPT2LMHeadModel)
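For context, a hedged usage sketch of the new `TFAutoModelForCausalLM` class together with the causal LM loss added in this PR. It assumes the `gpt2` checkpoint can be downloaded and that model outputs are plain tuples, as in this version of the library:

```python
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("TF models now compute their own loss", return_tensors="tf")
# Passing `labels` makes the CLM loss the first element of the returned tuple.
outputs = model(inputs["input_ids"], labels=inputs["input_ids"])
loss, logits = outputs[:2]
```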
@slow
def test_lmhead_model_from_pretrained(self):
for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
...@@ -84,6 +116,30 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_model_for_masked_lm(self):
for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_model_for_encoder_decoder_lm(self):
for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, T5Config)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFT5ForConditionalGeneration)
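Similarly, a sketch of loading an encoder-decoder model through the new `TFAutoModelForSeq2SeqLM` class; the `t5-small` checkpoint and the use of the TF `generate()` helper are assumptions made for illustration:

```python
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")  # resolves to TFT5ForConditionalGeneration

input_ids = tokenizer("translate English to German: How are you?", return_tensors="tf")["input_ids"]
generated = model.generate(input_ids)
print(tokenizer.decode(generated[0].numpy()))
```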
@slow
def test_sequence_classification_model_from_pretrained(self):
# for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
...@@ -119,3 +175,28 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsInstance(model, TFRobertaForMaskedLM)
self.assertEqual(model.num_parameters(), 14830)
self.assertEqual(model.num_parameters(only_trainable=True), 14830)
def test_parents_and_children_in_mappings(self):
# Test that the children are placed before the parents in the mappings, as the `isinstance` check will be
# triggered by the parents and will return the wrong configuration type when using auto models
mappings = (
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
)
for mapping in mappings:
mapping = tuple(mapping.items())
for index, (child_config, child_model) in enumerate(mapping[1:]):
for parent_config, parent_model in mapping[: index + 1]:
with self.subTest(
msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
):
self.assertFalse(issubclass(child_config, parent_config))
self.assertFalse(issubclass(child_model, parent_model))
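This test guards against a subtle resolution bug: the auto classes pick a model class by scanning the mapping in order with `isinstance`, so a config subclass listed after its parent would never be reached. A minimal sketch with made-up classes showing the failure mode:

```python
from collections import OrderedDict


class ParentConfig: ...
class ChildConfig(ParentConfig): ...


# Parent listed first: isinstance() matches it for child configs as well,
# which makes the child entry unreachable.
MAPPING = OrderedDict([(ParentConfig, "ParentModel"), (ChildConfig, "ChildModel")])


def resolve(config):
    for config_class, model_class in MAPPING.items():
        if isinstance(config, config_class):
            return model_class


assert resolve(ChildConfig()) == "ParentModel"  # the wrong (parent) model is selected
```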
...@@ -27,6 +27,7 @@ if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_bert import (
TFBertModel,
TFBertLMHeadModel,
TFBertForMaskedLM,
TFBertForNextSentencePrediction,
TFBertForPreTraining,
...@@ -142,11 +143,30 @@ class TFBertModelTester:
)
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_bert_lm_head(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.is_decoder = True
model = TFBertLMHeadModel(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(prediction_scores,) = model(inputs)
self.parent.assertListEqual(
list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
)
def create_and_check_bert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFBertForMaskedLM(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(prediction_scores,) = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
...@@ -186,11 +206,14 @@ class TFBertModelTester:
):
config.num_labels = self.num_labels
model = TFBertForSequenceClassification(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(logits,) = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_bert_for_multiple_choice(
...@@ -207,9 +230,7 @@ class TFBertModelTester:
"token_type_ids": multiple_choice_token_type_ids,
}
(logits,) = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_bert_for_token_classification(
...@@ -217,7 +238,11 @@ class TFBertModelTester:
):
config.num_labels = self.num_labels
model = TFBertForTokenClassification(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
...@@ -228,12 +253,14 @@ class TFBertModelTester:
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFBertForQuestionAnswering(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
start_logits, end_logits = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
...@@ -285,6 +312,10 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
def test_for_causal_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_lm_head(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
...