Unverified Commit f9414f75 authored by Julien Plu, committed by GitHub

Tensorflow improvements (#4530)



* Better None gradients handling

* Apply Style

* Apply Style

* Create a loss class per task to compute its respective loss

* Add loss classes to the ALBERT TF models

* Add loss classes to the BERT TF models

* Add question answering and multiple choice to TF Camembert

* Remove prints

* Add multiple choice model to TF DistilBERT + loss computation

* Add question answering model to TF Electra + loss computation

* Add token classification, question answering and multiple choice models to TF Flaubert

* Add multiple choice model to TF Roberta + loss computation

* Add multiple choice model to TF XLM + loss computation

* Add multiple choice and question answering models to TF XLM-Roberta

* Add multiple choice model to TF XLNet + loss computation

* Remove unused parameters

* Add task loss classes

* Reorder TF imports + add new model classes

* Add new model classes

* Bugfix in TF T5 model

* Bugfix for TF T5 tests

* Bugfix in TF T5 model

* Fix TF T5 model tests

* Fix T5 tests + some renaming

* Fix inheritance issue in the AutoX tests

* Add tests for TF Flaubert and TF XLM Roberta

* Add tests for TF Flaubert and TF XLM Roberta

* Remove unused piece of code in the TF trainer

* bugfix and remove unused code

* Bugfix for TF 2.2

* Apply Style

* Split TFSequenceClassificationAndMultipleChoiceLoss into its two respective loss classes

* Apply style

* Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameters, plus better dataset handling

* Fix TF optimizations tests and apply style

* Remove useless parameter

* Bugfix and apply style

* Fix TF Trainer prediction

* Now the TF models return the loss like their PyTorch counterparts (see the usage sketch after the commit metadata)

* Apply Style

* Ignore some tests output

* Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models.

* Fix names for SQuAD data

* Apply Style

* Fix conflicts with 2.11 release

* Fix conflicts with 2.11

* Fix wrong name

* Add better documentation on the new create_optimizer function

* Fix isort

* logging_dir: use same default as PyTorch
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
parent ccd26c28
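A minimal usage sketch of the new behavior (mirroring the updated docstrings further down in this diff): when labels are passed, the TF task heads now prepend the loss to their output tuple; without labels the outputs are unchanged. The checkpoint name and label value are illustrative only.

import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1))                                 # batch size 1

# With labels the outputs are (loss, logits, ...); without labels, (logits, ...) as before.
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]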
...@@ -8,6 +8,10 @@ __pycache__/
# C extensions
*.so
# tests and logs
tests/fixtures
logs/
# Distribution / packaging
.Python
build/
......
...@@ -352,173 +352,193 @@ if is_torch_available(): ...@@ -352,173 +352,193 @@ if is_torch_available():
# TensorFlow # TensorFlow
if is_tf_available(): if is_tf_available():
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFPreTrainedModel,
TFSharedEmbeddings,
TFSequenceSummary,
shape_list, shape_list,
tf_top_k_top_p_filtering, tf_top_k_top_p_filtering,
TFPreTrainedModel,
TFSequenceSummary,
TFSharedEmbeddings,
) )
from .modeling_tf_auto import ( from .modeling_tf_auto import (
TFAutoModel,
TFAutoModelForPreTraining,
TFAutoModelForMultipleChoice,
TFAutoModelForSequenceClassification,
TFAutoModelForQuestionAnswering,
TFAutoModelWithLMHead,
TFAutoModelForTokenClassification,
TF_MODEL_MAPPING, TF_MODEL_MAPPING,
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING, TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TFAutoModel,
TFAutoModelForMultipleChoice,
TFAutoModelForPreTraining,
TFAutoModelForQuestionAnswering,
TFAutoModelForSequenceClassification,
TFAutoModelForTokenClassification,
TFAutoModelWithLMHead,
)
from .modeling_tf_albert import (
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFAlbertForMaskedLM,
TFAlbertForMultipleChoice,
TFAlbertForPreTraining,
TFAlbertForQuestionAnswering,
TFAlbertForSequenceClassification,
TFAlbertForTokenClassification,
TFAlbertMainLayer,
TFAlbertModel,
TFAlbertPreTrainedModel,
) )
from .modeling_tf_bert import ( from .modeling_tf_bert import (
TFBertPreTrainedModel, TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFBertMainLayer,
TFBertEmbeddings, TFBertEmbeddings,
TFBertModel,
TFBertForPreTraining,
TFBertForMaskedLM, TFBertForMaskedLM,
TFBertForMultipleChoice,
TFBertForNextSentencePrediction, TFBertForNextSentencePrediction,
TFBertForPreTraining,
TFBertForQuestionAnswering,
TFBertForSequenceClassification, TFBertForSequenceClassification,
TFBertForMultipleChoice,
TFBertForTokenClassification, TFBertForTokenClassification,
TFBertForQuestionAnswering, TFBertMainLayer,
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFBertModel,
TFBertPreTrainedModel,
) )
from .modeling_tf_gpt2 import ( from .modeling_tf_camembert import (
TFGPT2PreTrainedModel, TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFGPT2MainLayer, TFCamembertForMaskedLM,
TFGPT2Model, TFCamembertModel,
TFGPT2LMHeadModel, TFCamembertForMultipleChoice,
TFGPT2DoubleHeadsModel, TFCamembertForQuestionAnswering,
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, TFCamembertForSequenceClassification,
TFCamembertForTokenClassification,
) )
from .modeling_tf_openai import ( from .modeling_tf_ctrl import (
TFOpenAIGPTPreTrainedModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFOpenAIGPTMainLayer, TFCTRLLMHeadModel,
TFOpenAIGPTModel, TFCTRLModel,
TFOpenAIGPTLMHeadModel, TFCTRLPreTrainedModel,
TFOpenAIGPTDoubleHeadsModel,
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
from .modeling_tf_transfo_xl import ( from .modeling_tf_distilbert import (
TFTransfoXLPreTrainedModel, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFTransfoXLMainLayer, TFDistilBertForMaskedLM,
TFTransfoXLModel, TFDistilBertForMultipleChoice,
TFTransfoXLLMHeadModel, TFDistilBertForQuestionAnswering,
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, TFDistilBertForSequenceClassification,
TFAdaptiveEmbedding, TFDistilBertForTokenClassification,
TFDistilBertMainLayer,
TFDistilBertModel,
TFDistilBertPreTrainedModel,
) )
from .modeling_tf_xlnet import ( from .modeling_tf_electra import (
TFXLNetPreTrainedModel, TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLNetMainLayer, TFElectraForMaskedLM,
TFXLNetModel, TFElectraForPreTraining,
TFXLNetLMHeadModel, TFElectraForQuestionAnswering,
TFXLNetForSequenceClassification, TFElectraForTokenClassification,
TFXLNetForTokenClassification, TFElectraModel,
TFXLNetForQuestionAnsweringSimple, TFElectraPreTrainedModel,
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
from .modeling_tf_xlm import ( from .modeling_tf_flaubert import (
TFXLMPreTrainedModel, TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLMMainLayer, TFFlaubertForMultipleChoice,
TFXLMModel, TFFlaubertForQuestionAnsweringSimple,
TFXLMWithLMHeadModel, TFFlaubertForSequenceClassification,
TFXLMForSequenceClassification, TFFlaubertForTokenClassification,
TFXLMForQuestionAnsweringSimple, TFFlaubertWithLMHeadModel,
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, TFFlaubertModel,
) )
from .modeling_tf_xlm_roberta import ( from .modeling_tf_gpt2 import (
TFXLMRobertaForMaskedLM, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
TFXLMRobertaModel, TFGPT2DoubleHeadsModel,
TFXLMRobertaForSequenceClassification, TFGPT2LMHeadModel,
TFXLMRobertaForTokenClassification, TFGPT2MainLayer,
TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, TFGPT2Model,
TFGPT2PreTrainedModel,
)
from .modeling_tf_openai import (
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFOpenAIGPTDoubleHeadsModel,
TFOpenAIGPTLMHeadModel,
TFOpenAIGPTMainLayer,
TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel,
) )
from .modeling_tf_roberta import ( from .modeling_tf_roberta import (
TFRobertaPreTrainedModel, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaMainLayer,
TFRobertaModel,
TFRobertaForMaskedLM, TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification, TFRobertaForSequenceClassification,
TFRobertaForTokenClassification, TFRobertaForTokenClassification,
TFRobertaForQuestionAnswering, TFRobertaMainLayer,
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, TFRobertaModel,
) TFRobertaPreTrainedModel,
from .modeling_tf_camembert import (
TFCamembertModel,
TFCamembertForMaskedLM,
TFCamembertForSequenceClassification,
TFCamembertForTokenClassification,
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)
from .modeling_tf_flaubert import (
TFFlaubertModel,
TFFlaubertWithLMHeadModel,
TFFlaubertForSequenceClassification,
TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
from .modeling_tf_distilbert import ( from .modeling_tf_t5 import (
TFDistilBertPreTrainedModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST,
TFDistilBertMainLayer, TFT5ForConditionalGeneration,
TFDistilBertModel, TFT5Model,
TFDistilBertForMaskedLM, TFT5PreTrainedModel,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertForQuestionAnswering,
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
) )
from .modeling_tf_ctrl import ( from .modeling_tf_transfo_xl import (
TFCTRLPreTrainedModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCTRLModel, TFAdaptiveEmbedding,
TFCTRLLMHeadModel, TFTransfoXLLMHeadModel,
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST, TFTransfoXLMainLayer,
TFTransfoXLModel,
TFTransfoXLPreTrainedModel,
) )
from .modeling_tf_albert import ( from .modeling_tf_xlm import (
TFAlbertPreTrainedModel, TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
TFAlbertMainLayer, TFXLMForMultipleChoice,
TFAlbertModel, TFXLMForQuestionAnsweringSimple,
TFAlbertForPreTraining, TFXLMForSequenceClassification,
TFAlbertForMaskedLM, TFXLMForTokenClassification,
TFAlbertForMultipleChoice, TFXLMWithLMHeadModel,
TFAlbertForSequenceClassification, TFXLMMainLayer,
TFAlbertForQuestionAnswering, TFXLMModel,
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFXLMPreTrainedModel,
) )
from .modeling_tf_t5 import ( from .modeling_tf_xlm_roberta import (
TFT5PreTrainedModel, TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFT5Model, TFXLMRobertaForMaskedLM,
TFT5ForConditionalGeneration, TFXLMRobertaForMultipleChoice,
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST, TFXLMRobertaForQuestionAnswering,
TFXLMRobertaForSequenceClassification,
TFXLMRobertaForTokenClassification,
TFXLMRobertaModel,
) )
from .modeling_tf_electra import ( from .modeling_tf_xlnet import (
TFElectraPreTrainedModel, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST,
TFElectraModel, TFXLNetForMultipleChoice,
TFElectraForPreTraining, TFXLNetForQuestionAnsweringSimple,
TFElectraForMaskedLM, TFXLNetForSequenceClassification,
TFElectraForTokenClassification, TFXLNetForTokenClassification,
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST, TFXLNetLMHeadModel,
TFXLNetMainLayer,
TFXLNetModel,
TFXLNetPreTrainedModel,
) )
# Optimization # Optimization
from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator from .optimization_tf import (
AdamWeightDecay,
create_optimizer,
GradientAccumulator,
WarmUp,
)
# Trainer # Trainer
from .trainer_tf import TFTrainer from .trainer_tf import TFTrainer
......
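The optimization exports above are only reordered here, but since the commit message mentions better documentation for create_optimizer, here is a hedged sketch of how these pieces typically fit together. The WarmUp and AdamWeightDecay argument names below are not shown in this diff and should be treated as assumptions.

import tensorflow as tf
from transformers import AdamWeightDecay, WarmUp

# Linear decay to zero after a warmup phase (all values are illustrative).
decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=3e-5, decay_steps=10000, end_learning_rate=0.0
)
lr_schedule = WarmUp(
    initial_learning_rate=3e-5, decay_schedule_fn=decay_schedule, warmup_steps=1000
)
optimizer = AdamWeightDecay(learning_rate=lr_schedule, weight_decay_rate=0.01)

create_optimizer wraps roughly this wiring behind a single call.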
...@@ -394,8 +394,8 @@ def squad_convert_examples_to_features( ...@@ -394,8 +394,8 @@ def squad_convert_examples_to_features(
"qas_id": ex.qas_id, "qas_id": ex.qas_id,
}, },
{ {
"start_position": ex.start_position, "start_positions": ex.start_position,
"end_position": ex.end_position, "end_positions": ex.end_position,
"cls_index": ex.cls_index, "cls_index": ex.cls_index,
"p_mask": ex.p_mask, "p_mask": ex.p_mask,
"is_impossible": ex.is_impossible, "is_impossible": ex.is_impossible,
...@@ -412,8 +412,8 @@ def squad_convert_examples_to_features( ...@@ -412,8 +412,8 @@ def squad_convert_examples_to_features(
"qas_id": tf.string, "qas_id": tf.string,
}, },
{ {
"start_position": tf.int64, "start_positions": tf.int64,
"end_position": tf.int64, "end_positions": tf.int64,
"cls_index": tf.int64, "cls_index": tf.int64,
"p_mask": tf.int32, "p_mask": tf.int32,
"is_impossible": tf.int32, "is_impossible": tf.int32,
...@@ -429,8 +429,8 @@ def squad_convert_examples_to_features( ...@@ -429,8 +429,8 @@ def squad_convert_examples_to_features(
"qas_id": tf.TensorShape([]), "qas_id": tf.TensorShape([]),
}, },
{ {
"start_position": tf.TensorShape([]), "start_positions": tf.TensorShape([]),
"end_position": tf.TensorShape([]), "end_positions": tf.TensorShape([]),
"cls_index": tf.TensorShape([]), "cls_index": tf.TensorShape([]),
"p_mask": tf.TensorShape([None]), "p_mask": tf.TensorShape([None]),
"is_impossible": tf.TensorShape([]), "is_impossible": tf.TensorShape([]),
......
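The renamed keys now line up with the start_positions / end_positions arguments added to the TF question-answering heads later in this diff. A small sketch of a batch shaped like the tf.data output types and shapes declared above (the feature keys and all values are dummies):

import tensorflow as tf

features = {
    "input_ids": tf.zeros((1, 8), dtype=tf.int64),
    "attention_mask": tf.ones((1, 8), dtype=tf.int64),
    "token_type_ids": tf.zeros((1, 8), dtype=tf.int64),
}
labels = {
    "start_positions": tf.constant([3], dtype=tf.int64),
    "end_positions": tf.constant([5], dtype=tf.int64),
    "cls_index": tf.constant([0], dtype=tf.int64),
    "p_mask": tf.zeros((1, 8), dtype=tf.int32),
    "is_impossible": tf.constant([0], dtype=tf.int32),
}
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(1)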
...@@ -81,6 +81,8 @@ class HfArgumentParser(ArgumentParser):
kwargs["type"] = field.type
if field.default is not dataclasses.MISSING:
kwargs["default"] = field.default
elif field.default_factory is not dataclasses.MISSING:
kwargs["default"] = field.default_factory()
else:
kwargs["required"] = True
self.add_argument(field_name, **kwargs)
......
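With the new elif branch, a dataclass field that only defines a default_factory no longer turns into a required command-line argument: the factory is called once to obtain the argparse default. This is what allows a lazily computed default such as the logging_dir mentioned in the commit message. A minimal sketch with a hypothetical dataclass:

from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class RunArguments:
    # Only a default_factory, no plain default: with the change above the parser
    # calls the factory instead of marking --logging_dir as required.
    logging_dir: str = field(default_factory=lambda: "runs/current")


parser = HfArgumentParser(RunArguments)
(args,) = parser.parse_args_into_dataclasses(args=[])
print(args.logging_dir)  # "runs/current"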
...@@ -23,7 +23,16 @@ import tensorflow as tf ...@@ -23,7 +23,16 @@ import tensorflow as tf
from .configuration_albert import AlbertConfig from .configuration_albert import AlbertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
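The loss mixins imported above (TFMultipleChoiceLoss, TFQuestionAnsweringLoss, TFSequenceClassificationLoss, TFTokenClassificationLoss) live in modeling_tf_utils.py and are not part of this hunk. As a rough, hypothetical sketch of the pattern, based on the docstrings below (mean-squared error when config.num_labels == 1, sparse cross-entropy otherwise), a sequence-classification mixin could look like this:

import tensorflow as tf


class TFSequenceClassificationLoss:
    # Hypothetical sketch only; the real mixin is defined in modeling_tf_utils.py.
    def compute_loss(self, labels, logits):
        if logits.shape[-1] == 1:
            # Regression head: mean-squared error per example.
            loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
        else:
            # Classification head: sparse categorical cross-entropy on raw logits.
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE
            )
        return loss_fn(labels, logits)

The task heads below mix such a class in next to TFAlbertPreTrainedModel and call self.compute_loss(labels, logits) when labels are provided.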
...@@ -841,7 +850,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): ...@@ -841,7 +850,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
the pooled output) e.g. for GLUE tasks. """, the pooled output) e.g. for GLUE tasks. """,
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -852,9 +861,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): ...@@ -852,9 +861,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`)
...@@ -878,27 +903,126 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): ...@@ -878,27 +903,126 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2') model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
logits = outputs[0] outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.albert(inputs, **kwargs)
outputs = self.albert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # logits, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
"""Albert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ALBERT_START_DOCSTRING,
)
class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.albert = TFAlbertMainLayer(config, name="albert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForTokenClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.albert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
"""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -908,9 +1032,32 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): ...@@ -908,9 +1032,32 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
...@@ -938,14 +1085,23 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): ...@@ -938,14 +1085,23 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel):
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2') model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids) all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
outputs = self.albert(inputs, **kwargs) outputs = self.albert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -956,7 +1112,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): ...@@ -956,7 +1112,13 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel):
outputs = (start_logits, end_logits,) + outputs[2:] outputs = (start_logits, end_logits,) + outputs[2:]
return outputs # start_logits, end_logits, (hidden_states), (attentions) if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
...@@ -964,7 +1126,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel): ...@@ -964,7 +1126,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel):
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ALBERT_START_DOCSTRING, ALBERT_START_DOCSTRING,
) )
class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
...@@ -992,9 +1154,15 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): ...@@ -992,9 +1154,15 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
training=False, training=False,
): ):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
...@@ -1019,12 +1187,13 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): ...@@ -1019,12 +1187,13 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel):
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2') model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
example1 = ["This is a context", "Is it a context? Yes"] loss, classification_scores = outputs[:2]
example2 = ["This is a context", "Is it a context? No"]
encoding = tokenizer.batch_encode_plus([example1, example2], return_tensors='tf', truncation_strategy="only_first", pad_to_max_length=True, max_length=128)
outputs = model(encoding["input_ids"][None, :])
logits = outputs[0]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -1036,10 +1205,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): ...@@ -1036,10 +1205,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel):
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs." assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, dict):
print("isdict(1)")
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
print(input_ids)
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids) token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids) position_ids = inputs.get("position_ids", position_ids)
...@@ -1080,4 +1246,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel): ...@@ -1080,4 +1246,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel):
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # reshaped_logits, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
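Putting the new ALBERT heads above together, a hedged usage sketch for the question-answering loss (the span indices are arbitrary; with start_positions and end_positions provided, the first output should now be the loss):

import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = TFAlbertForQuestionAnswering.from_pretrained("albert-base-v2")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors="tf")

outputs = model(
    input_dict,
    start_positions=tf.constant([8]),
    end_positions=tf.constant([11]),
)
loss, start_logits, end_logits = outputs[:3]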
...@@ -22,14 +22,18 @@ from .configuration_auto import (
AlbertConfig,
AutoConfig,
BertConfig,
CamembertConfig,
CTRLConfig,
DistilBertConfig,
ElectraConfig,
FlaubertConfig,
GPT2Config,
OpenAIGPTConfig,
RobertaConfig,
T5Config,
TransfoXLConfig,
XLMConfig,
XLMRobertaConfig,
XLNetConfig,
)
from .configuration_utils import PretrainedConfig
...@@ -39,6 +43,7 @@ from .modeling_tf_albert import (
TFAlbertForPreTraining,
TFAlbertForQuestionAnswering,
TFAlbertForSequenceClassification,
TFAlbertForTokenClassification,
TFAlbertModel,
)
from .modeling_tf_bert import (
...@@ -50,18 +55,43 @@ from .modeling_tf_bert import (
TFBertForTokenClassification,
TFBertModel,
)
from .modeling_tf_camembert import (
TFCamembertForMaskedLM,
TFCamembertForMultipleChoice,
TFCamembertForQuestionAnswering,
TFCamembertForSequenceClassification,
TFCamembertForTokenClassification,
TFCamembertModel,
)
from .modeling_tf_ctrl import TFCTRLLMHeadModel, TFCTRLModel
from .modeling_tf_distilbert import (
TFDistilBertForMaskedLM,
TFDistilBertForMultipleChoice,
TFDistilBertForQuestionAnswering,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertModel,
)
from .modeling_tf_electra import (
TFElectraForMaskedLM,
TFElectraForPreTraining,
TFElectraForQuestionAnswering,
TFElectraForTokenClassification,
TFElectraModel,
)
from .modeling_tf_flaubert import (
TFFlaubertForMultipleChoice,
TFFlaubertForQuestionAnsweringSimple,
TFFlaubertForSequenceClassification,
TFFlaubertForTokenClassification,
TFFlaubertModel,
TFFlaubertWithLMHeadModel,
)
from .modeling_tf_gpt2 import TFGPT2LMHeadModel, TFGPT2Model
from .modeling_tf_openai import TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
from .modeling_tf_roberta import (
TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
...@@ -70,12 +100,23 @@ from .modeling_tf_roberta import (
from .modeling_tf_t5 import TFT5ForConditionalGeneration, TFT5Model
from .modeling_tf_transfo_xl import TFTransfoXLLMHeadModel, TFTransfoXLModel
from .modeling_tf_xlm import (
TFXLMForMultipleChoice,
TFXLMForQuestionAnsweringSimple,
TFXLMForSequenceClassification,
TFXLMForTokenClassification,
TFXLMModel,
TFXLMWithLMHeadModel,
)
from .modeling_tf_xlm_roberta import (
TFXLMRobertaForMaskedLM,
TFXLMRobertaForMultipleChoice,
TFXLMRobertaForQuestionAnswering,
TFXLMRobertaForSequenceClassification,
TFXLMRobertaForTokenClassification,
TFXLMRobertaModel,
)
from .modeling_tf_xlnet import (
TFXLNetForMultipleChoice,
TFXLNetForQuestionAnsweringSimple,
TFXLNetForSequenceClassification,
TFXLNetForTokenClassification,
...@@ -89,83 +130,118 @@ logger = logging.getLogger(__name__)
TF_MODEL_MAPPING = OrderedDict( TF_MODEL_MAPPING = OrderedDict(
[ [
(T5Config, TFT5Model),
(DistilBertConfig, TFDistilBertModel),
(AlbertConfig, TFAlbertModel), (AlbertConfig, TFAlbertModel),
(RobertaConfig, TFRobertaModel),
(BertConfig, TFBertModel), (BertConfig, TFBertModel),
(OpenAIGPTConfig, TFOpenAIGPTModel), (CamembertConfig, TFCamembertModel),
(CTRLConfig, TFCTRLModel),
(DistilBertConfig, TFDistilBertModel),
(ElectraConfig, TFElectraModel),
(FlaubertConfig, TFFlaubertModel),
(GPT2Config, TFGPT2Model), (GPT2Config, TFGPT2Model),
(OpenAIGPTConfig, TFOpenAIGPTModel),
(RobertaConfig, TFRobertaModel),
(T5Config, TFT5Model),
(TransfoXLConfig, TFTransfoXLModel), (TransfoXLConfig, TFTransfoXLModel),
(XLNetConfig, TFXLNetModel),
(XLMConfig, TFXLMModel), (XLMConfig, TFXLMModel),
(CTRLConfig, TFCTRLModel), (XLMRobertaConfig, TFXLMRobertaModel),
(XLNetConfig, TFXLNetModel),
] ]
) )
TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
[ [
(T5Config, TFT5ForConditionalGeneration),
(DistilBertConfig, TFDistilBertForMaskedLM),
(AlbertConfig, TFAlbertForPreTraining), (AlbertConfig, TFAlbertForPreTraining),
(RobertaConfig, TFRobertaForMaskedLM),
(BertConfig, TFBertForPreTraining), (BertConfig, TFBertForPreTraining),
(OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (CamembertConfig, TFCamembertForMaskedLM),
(CTRLConfig, TFCTRLLMHeadModel),
(DistilBertConfig, TFDistilBertForMaskedLM),
(ElectraConfig, TFElectraForPreTraining),
(FlaubertConfig, TFFlaubertWithLMHeadModel),
(GPT2Config, TFGPT2LMHeadModel), (GPT2Config, TFGPT2LMHeadModel),
(OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
(RobertaConfig, TFRobertaForMaskedLM),
(T5Config, TFT5ForConditionalGeneration),
(TransfoXLConfig, TFTransfoXLLMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel),
(XLNetConfig, TFXLNetLMHeadModel),
(XLMConfig, TFXLMWithLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel),
(CTRLConfig, TFCTRLLMHeadModel), (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
(XLNetConfig, TFXLNetLMHeadModel),
] ]
) )
TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
[ [
(T5Config, TFT5ForConditionalGeneration),
(DistilBertConfig, TFDistilBertForMaskedLM),
(AlbertConfig, TFAlbertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM),
(RobertaConfig, TFRobertaForMaskedLM),
(BertConfig, TFBertForMaskedLM), (BertConfig, TFBertForMaskedLM),
(OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (CamembertConfig, TFCamembertForMaskedLM),
(CTRLConfig, TFCTRLLMHeadModel),
(DistilBertConfig, TFDistilBertForMaskedLM),
(ElectraConfig, TFElectraForMaskedLM),
(FlaubertConfig, TFFlaubertWithLMHeadModel),
(GPT2Config, TFGPT2LMHeadModel), (GPT2Config, TFGPT2LMHeadModel),
(OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
(RobertaConfig, TFRobertaForMaskedLM),
(T5Config, TFT5ForConditionalGeneration),
(TransfoXLConfig, TFTransfoXLLMHeadModel), (TransfoXLConfig, TFTransfoXLLMHeadModel),
(XLNetConfig, TFXLNetLMHeadModel),
(XLMConfig, TFXLMWithLMHeadModel), (XLMConfig, TFXLMWithLMHeadModel),
(CTRLConfig, TFCTRLLMHeadModel), (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
(XLNetConfig, TFXLNetLMHeadModel),
] ]
) )
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
[ [
(DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForMultipleChoice),
(AlbertConfig, TFAlbertForSequenceClassification), (BertConfig, TFBertForMultipleChoice),
(RobertaConfig, TFRobertaForSequenceClassification), (CamembertConfig, TFCamembertForMultipleChoice),
(BertConfig, TFBertForSequenceClassification), (DistilBertConfig, TFDistilBertForMultipleChoice),
(XLNetConfig, TFXLNetForSequenceClassification), (FlaubertConfig, TFFlaubertForMultipleChoice),
(XLMConfig, TFXLMForSequenceClassification), (RobertaConfig, TFRobertaForMultipleChoice),
(XLMConfig, TFXLMForMultipleChoice),
(XLMRobertaConfig, TFXLMRobertaForMultipleChoice),
(XLNetConfig, TFXLNetForMultipleChoice),
] ]
) )
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
[(BertConfig, TFBertForMultipleChoice), (AlbertConfig, TFAlbertForMultipleChoice)]
)
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
[ [
(DistilBertConfig, TFDistilBertForQuestionAnswering),
(AlbertConfig, TFAlbertForQuestionAnswering), (AlbertConfig, TFAlbertForQuestionAnswering),
(RobertaConfig, TFRobertaForQuestionAnswering),
(BertConfig, TFBertForQuestionAnswering), (BertConfig, TFBertForQuestionAnswering),
(XLNetConfig, TFXLNetForQuestionAnsweringSimple), (CamembertConfig, TFCamembertForQuestionAnswering),
(DistilBertConfig, TFDistilBertForQuestionAnswering),
(ElectraConfig, TFElectraForQuestionAnswering),
(FlaubertConfig, TFFlaubertForQuestionAnsweringSimple),
(RobertaConfig, TFRobertaForQuestionAnswering),
(XLMConfig, TFXLMForQuestionAnsweringSimple), (XLMConfig, TFXLMForQuestionAnsweringSimple),
(XLMRobertaConfig, TFXLMRobertaForQuestionAnswering),
(XLNetConfig, TFXLNetForQuestionAnsweringSimple),
]
)
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
[
(AlbertConfig, TFAlbertForSequenceClassification),
(BertConfig, TFBertForSequenceClassification),
(CamembertConfig, TFCamembertForSequenceClassification),
(DistilBertConfig, TFDistilBertForSequenceClassification),
(FlaubertConfig, TFFlaubertForSequenceClassification),
(RobertaConfig, TFRobertaForSequenceClassification),
(XLMConfig, TFXLMForSequenceClassification),
(XLMRobertaConfig, TFXLMRobertaForSequenceClassification),
(XLNetConfig, TFXLNetForSequenceClassification),
] ]
) )
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
[ [
(AlbertConfig, TFAlbertForTokenClassification),
(BertConfig, TFBertForTokenClassification),
(CamembertConfig, TFCamembertForTokenClassification),
(DistilBertConfig, TFDistilBertForTokenClassification), (DistilBertConfig, TFDistilBertForTokenClassification),
(ElectraConfig, TFElectraForTokenClassification),
(FlaubertConfig, TFFlaubertForTokenClassification),
(RobertaConfig, TFRobertaForTokenClassification), (RobertaConfig, TFRobertaForTokenClassification),
(BertConfig, TFBertForTokenClassification), (XLMConfig, TFXLMForTokenClassification),
(XLMRobertaConfig, TFXLMRobertaForTokenClassification),
(XLNetConfig, TFXLNetForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification),
] ]
) )
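A small sketch of what the extended mappings above enable through the TF auto classes (the checkpoint names are just examples; the task heads on top of these checkpoints start out randomly initialized):

from transformers import TFAutoModelForMultipleChoice, TFAutoModelForTokenClassification

# BertConfig -> TFBertForMultipleChoice via TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING
mc_model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")

# AlbertConfig -> TFAlbertForTokenClassification via TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
ner_model = TFAutoModelForTokenClassification.from_pretrained("albert-base-v2")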
...@@ -632,11 +708,13 @@ class TFAutoModelWithLMHead(object): ...@@ -632,11 +708,13 @@ class TFAutoModelWithLMHead(object):
""" """
config = kwargs.pop("config", None) config = kwargs.pop("config", None)
if not isinstance(config, PretrainedConfig): if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items():
if isinstance(config, config_class): # Not using isinstance() here to do not take into account inheritance
if config_class == type(config):
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
raise ValueError( raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
......
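Why the exact-type check above: some configuration classes subclass others (CamembertConfig derives from RobertaConfig, for example), so an isinstance() scan over the mapping can match a parent entry and dispatch to the wrong model class. A quick illustration:

from transformers import CamembertConfig, RobertaConfig

config = CamembertConfig()

print(isinstance(config, RobertaConfig))   # True  -> an inheritance-based lookup can mis-dispatch
print(type(config) == RobertaConfig)       # False
print(type(config) == CamembertConfig)     # True  -> exact match, as in the new check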
...@@ -23,7 +23,16 @@ import tensorflow as tf ...@@ -23,7 +23,16 @@ import tensorflow as tf
from .configuration_bert import BertConfig from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
...@@ -880,7 +889,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -880,7 +889,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
the pooled output) e.g. for GLUE tasks. """, the pooled output) e.g. for GLUE tasks. """,
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForSequenceClassification(TFBertPreTrainedModel): class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -891,9 +900,25 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): ...@@ -891,9 +900,25 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
...@@ -916,21 +941,35 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): ...@@ -916,21 +941,35 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
logits = outputs[0] outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.bert(inputs, **kwargs)
outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
pooled_output = outputs[1] pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output) logits = self.classifier(pooled_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # logits, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
...@@ -938,7 +977,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): ...@@ -938,7 +977,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForMultipleChoice(TFBertPreTrainedModel): class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
...@@ -966,9 +1005,15 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): ...@@ -966,9 +1005,15 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
training=False, training=False,
): ):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
...@@ -993,15 +1038,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): ...@@ -993,15 +1038,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
choice0 = "It is eaten with a fork and a knife." labels = tf.reshape(tf.constant(1), (-1, 1))
choice1 = "It is eaten while held in the hand." outputs = model(input_ids, labels=labels)
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='tf', pad_to_max_length=True)
loss, classification_scores = outputs[:2]
# linear classifier on the output is not yet trained
outputs = model(encoding['input_ids'][None, :])
logits = outputs[0]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -1011,7 +1055,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): ...@@ -1011,7 +1055,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
head_mask = inputs[4] if len(inputs) > 4 else head_mask head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs." assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids) token_type_ids = inputs.get("token_type_ids", token_type_ids)
...@@ -1053,7 +1097,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): ...@@ -1053,7 +1097,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # reshaped_logits, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
...@@ -1061,7 +1109,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): ...@@ -1061,7 +1109,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForTokenClassification(TFBertPreTrainedModel): class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -1072,9 +1120,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): ...@@ -1072,9 +1120,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
...@@ -1098,20 +1160,33 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): ...@@ -1098,20 +1160,33 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids) labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
scores = outputs[0] outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.bert(inputs, **kwargs) outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # scores, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
...@@ -1119,7 +1194,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): ...@@ -1119,7 +1194,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
the hidden-states output to compute `span start logits` and `span end logits`). """, the hidden-states output to compute `span start logits` and `span end logits`). """,
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class TFBertForQuestionAnswering(TFBertPreTrainedModel): class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -1129,9 +1204,32 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
...@@ -1156,18 +1254,24 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
""" """
outputs = self.bert(inputs, **kwargs) outputs = self.bert(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1178,4 +1282,10 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
outputs = (start_logits, end_logits,) + outputs[2:] outputs = (start_logits, end_logits,) + outputs[2:]
return outputs # start_logits, end_logits, (hidden_states), (attentions) if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
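With `start_positions` and `end_positions` supplied, the question-answering head now returns the span loss first as well. A minimal sketch of computing that loss on a single example; the gold indices below are illustrative placeholders, not verified token positions:

import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer.encode_plus(question, text, return_tensors='tf')
outputs = model(
    inputs,
    start_positions=tf.constant([10]),  # placeholder gold start index
    end_positions=tf.constant([12]),    # placeholder gold end index
)
loss, start_logits, end_logits = outputs[:3]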
...@@ -22,6 +22,8 @@ from .configuration_camembert import CamembertConfig
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .modeling_tf_roberta import ( from .modeling_tf_roberta import (
TFRobertaForMaskedLM, TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification, TFRobertaForSequenceClassification,
TFRobertaForTokenClassification, TFRobertaForTokenClassification,
TFRobertaModel, TFRobertaModel,
...@@ -114,3 +116,30 @@ class TFCamembertForTokenClassification(TFRobertaForTokenClassification):
""" """
config_class = CamembertConfig config_class = CamembertConfig
@add_start_docstrings(
"""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
CAMEMBERT_START_DOCSTRING,
)
class TFCamembertForMultipleChoice(TFRobertaForMultipleChoice):
"""
This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
@add_start_docstrings(
"""CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
CAMEMBERT_START_DOCSTRING,
)
class TFCamembertForQuestionAnswering(TFRobertaForQuestionAnswering):
"""
This class overrides :class:`~transformers.TFRobertaForQuestionAnswering`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = CamembertConfig
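Both new CamemBERT heads only swap in `CamembertConfig`; signatures, outputs and losses come from the RoBERTa classes they subclass (shown later in this diff). A minimal sketch with a freshly initialised model, using a default config and arbitrary token ids rather than a published checkpoint:

import tensorflow as tf
from transformers import CamembertConfig, TFCamembertForQuestionAnswering

config = CamembertConfig()                       # placeholder config, no pretrained weights
model = TFCamembertForQuestionAnswering(config)  # QA head is randomly initialised

input_ids = tf.constant([[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]])  # arbitrary ids
start_scores, end_scores = model(input_ids)[:2]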
...@@ -23,8 +23,18 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_distilbert import DistilBertConfig from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, get_initializer, shape_list from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
from .tokenization_utils import BatchEncoding
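The `TF...Loss` classes imported here are mixins that give each task model a `compute_loss(labels, logits)` helper. An illustrative-only sketch of the pattern for token classification (the real implementations live in `modeling_tf_utils.py` and may differ in detail, e.g. in how ignored positions are marked):

import tensorflow as tf

class SketchTokenClassificationLoss:
    def compute_loss(self, labels, logits):
        # Sparse cross-entropy on un-normalized logits, skipping positions labelled -1.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        active = tf.reshape(labels, (-1,)) != -1
        active_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[-1])), active)
        active_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)
        return loss_fn(active_labels, active_logits)

# quick check with fake data
logits = tf.random.normal((2, 5, 3))
labels = tf.constant([[0, 1, 2, -1, -1], [2, 2, 0, 1, -1]])
per_token_loss = SketchTokenClassificationLoss().compute_loss(labels, logits)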
...@@ -399,7 +409,10 @@ class TFTransformer(tf.keras.layers.Layer):
return outputs # last-layer hidden state, (all hidden states), (all attentions) return outputs # last-layer hidden state, (all hidden states), (all attentions)
@keras_serializable
class TFDistilBertMainLayer(tf.keras.layers.Layer): class TFDistilBertMainLayer(tf.keras.layers.Layer):
config_class = DistilBertConfig
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.num_hidden_layers = config.num_hidden_layers
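`@keras_serializable` together with a `config_class` is what lets the main layer be rebuilt from a plain config dict, which Keras relies on when cloning or serializing models that embed it. A toy illustration of that round-trip idea, not the library's implementation:

import tensorflow as tf

class ToyMainLayer(tf.keras.layers.Layer):
    def __init__(self, hidden_size=768, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.dense = tf.keras.layers.Dense(hidden_size)

    def call(self, inputs):
        return self.dense(inputs)

    def get_config(self):
        config = super().get_config()
        config.update({"hidden_size": self.hidden_size})
        return config

layer = ToyMainLayer(hidden_size=16)
clone = ToyMainLayer.from_config(layer.get_config())  # round-trips through a plain dict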
...@@ -662,7 +675,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
the pooled output) e.g. for GLUE tasks. """, the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -680,8 +693,16 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`):
...@@ -705,20 +726,32 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
distilbert_output = self.distilbert(inputs, **kwargs) distilbert_output = self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
hidden_state = distilbert_output[0] # (bs, seq_len, dim) hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim) pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim) pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) pooled_output = self.dropout(pooled_output, training=training) # (bs, dim)
logits = self.classifier(pooled_output) # (bs, dim) logits = self.classifier(pooled_output) # (bs, dim)
outputs = (logits,) + distilbert_output[1:] outputs = (logits,) + distilbert_output[1:]
return outputs # logits, (hidden_states), (attentions)
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
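As the docstring above notes, the same head switches to a mean-squared-error loss when `config.num_labels == 1`. A minimal regression sketch; the target value is made up:

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
targets = tf.constant([[0.7]])  # illustrative regression target
loss, logits = model(input_ids, labels=targets)[:2]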
@add_start_docstrings( @add_start_docstrings(
...@@ -726,7 +759,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): ...@@ -726,7 +759,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -738,8 +771,14 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
) )
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
...@@ -762,20 +801,154 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased') model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.distilbert(inputs, **kwargs) outputs = self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # scores, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
"""DistilBert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.pre_classifier = tf.keras.layers.Dense(
config.dim,
kernel_initializer=get_initializer(config.initializer_range),
activation="relu",
name="pre_classifier",
)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def call(
self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
head_mask = inputs[2] if len(inputs) > 2 else head_mask
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 4, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_inputs = [
flat_input_ids,
flat_attention_mask,
head_mask,
inputs_embeds,
]
distilbert_output = self.distilbert(flat_inputs, training=training)
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output, training=training) # (bs, dim)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + distilbert_output[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
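The new multiple-choice head expects rank-3 inputs of shape (batch_size, num_choices, seq_len) and folds the choices into the batch before running the encoder. A sketch of preparing such a batch by hand; the manual padding is illustrative and a data collator would normally do this:

import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')

choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
encoded = [tokenizer.encode(c, add_special_tokens=True) for c in choices]
max_len = max(len(ids) for ids in encoded)
padded = [ids + [tokenizer.pad_token_id] * (max_len - len(ids)) for ids in encoded]

input_ids = tf.constant(padded)[None, :, :]  # shape (1, 2, max_len)
labels = tf.constant([1])                    # the second choice is treated as correct here
loss, classification_scores = model(input_ids, labels=labels)[:2]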
@add_start_docstrings( @add_start_docstrings(
...@@ -783,7 +956,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
the hidden-states output to compute `span start logits` and `span end logits`). """, the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
...@@ -795,8 +968,29 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
self.dropout = tf.keras.layers.Dropout(config.qa_dropout) self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
...@@ -821,19 +1015,35 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
distilbert_output = self.distilbert(inputs, **kwargs) distilbert_output = self.distilbert(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim) hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim)
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + distilbert_output[1:] outputs = (start_logits, end_logits,) + distilbert_output[1:]
return outputs # start_logits, end_logits, (hidden_states), (attentions)
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
...@@ -6,7 +6,13 @@ from transformers import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import get_initializer, shape_list from .modeling_tf_utils import (
TFQuestionAnsweringLoss,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
...@@ -194,6 +200,7 @@ class TFElectraPreTrainedModel(TFBertPreTrainedModel):
return head_mask return head_mask
@keras_serializable
class TFElectraMainLayer(TFElectraPreTrainedModel): class TFElectraMainLayer(TFElectraPreTrainedModel):
config_class = ElectraConfig config_class = ElectraConfig
...@@ -557,13 +564,15 @@ Electra model with a token classification head on top.
Both the discriminator and generator may be loaded into this model.""", Both the discriminator and generator may be loaded into this model.""",
ELECTRA_START_DOCSTRING, ELECTRA_START_DOCSTRING,
) )
class TFElectraForTokenClassification(TFElectraPreTrainedModel): class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(config, **kwargs) super().__init__(config, **kwargs)
self.electra = TFElectraMainLayer(config, name="electra") self.electra = TFElectraMainLayer(config, name="electra")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call( def call(
...@@ -574,9 +583,14 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
training=False, training=False,
): ):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
...@@ -599,9 +613,11 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel):
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
...@@ -610,7 +626,106 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel):
discriminator_sequence_output = discriminator_hidden_states[0] discriminator_sequence_output = discriminator_hidden_states[0]
discriminator_sequence_output = self.dropout(discriminator_sequence_output) discriminator_sequence_output = self.dropout(discriminator_sequence_output)
logits = self.classifier(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output)
output = (logits,)
output += discriminator_hidden_states[1:]
return output # (loss), scores, (hidden_states), (attentions) outputs = (logits,) + discriminator_hidden_states[1:]
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), scores, (hidden_states), (attentions)
@add_start_docstrings(
"""Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
ELECTRA_START_DOCSTRING,
)
class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.electra = TFElectraMainLayer(config, name="electra")
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax).
end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training
)
discriminator_sequence_output = discriminator_hidden_states[0]
logits = self.qa_outputs(discriminator_sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + discriminator_hidden_states[1:]
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
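The extra `cls_index`, `p_mask` and `is_impossible` arguments mirror the fields produced by the SQuAD data processors, but only the start/end positions feed the loss of this simple span head. A minimal sketch; the gold indices are illustrative placeholders:

import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-discriminator')

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer.encode_plus(question, text, return_tensors='tf')
outputs = model(
    inputs,
    start_positions=tf.constant([10]),  # placeholder gold start index
    end_positions=tf.constant([12]),    # placeholder gold end index
)
loss, start_logits, end_logits = outputs[:3]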
...@@ -22,13 +22,16 @@ import tensorflow as tf
from .configuration_flaubert import FlaubertConfig from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .modeling_tf_utils import keras_serializable, shape_list
from .modeling_tf_xlm import ( from .modeling_tf_xlm import (
TFXLMForMultipleChoice,
TFXLMForQuestionAnsweringSimple,
TFXLMForSequenceClassification, TFXLMForSequenceClassification,
TFXLMForTokenClassification,
TFXLMMainLayer, TFXLMMainLayer,
TFXLMModel, TFXLMModel,
TFXLMWithLMHeadModel, TFXLMWithLMHeadModel,
get_masks, get_masks,
shape_list,
) )
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
...@@ -112,6 +115,7 @@ class TFFlaubertModel(TFXLMModel):
self.transformer = TFFlaubertMainLayer(config, name="transformer") self.transformer = TFFlaubertMainLayer(config, name="transformer")
@keras_serializable
class TFFlaubertMainLayer(TFXLMMainLayer): class TFFlaubertMainLayer(TFXLMMainLayer):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
...@@ -327,3 +331,38 @@ class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer") self.transformer = TFFlaubertMainLayer(config, name="transformer")
@add_start_docstrings(
"""Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForQuestionAnsweringSimple(TFXLMForQuestionAnsweringSimple):
config_class = FlaubertConfig
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
@add_start_docstrings(
"""Flaubert Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForTokenClassification(TFXLMForTokenClassification):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
@add_start_docstrings(
"""Flaubert Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
FLAUBERT_START_DOCSTRING,
)
class TFFlaubertForMultipleChoice(TFXLMForMultipleChoice):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFFlaubertMainLayer(config, name="transformer")
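All four Flaubert heads are thin wrappers that only substitute `TFFlaubertMainLayer`; signatures, outputs and losses come from the XLM parents. A minimal sketch with a randomly initialised token-classification head, assuming the XLM parent accepts a bare `input_ids` tensor like the other heads in this diff (config values and ids are placeholders):

import tensorflow as tf
from transformers import FlaubertConfig, TFFlaubertForTokenClassification

config = FlaubertConfig(num_labels=5)        # placeholder label count, no pretrained weights
model = TFFlaubertForTokenClassification(config)

input_ids = tf.constant([[0, 345, 232, 328, 740, 140, 1]])  # arbitrary token ids
scores = model(input_ids)[0]                 # (1, seq_len, 5) logits from a randomly initialised head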
...@@ -29,6 +29,7 @@ from .modeling_tf_utils import (
TFSequenceSummary, TFSequenceSummary,
TFSharedEmbeddings, TFSharedEmbeddings,
get_initializer, get_initializer,
keras_serializable,
shape_list, shape_list,
) )
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
...@@ -199,7 +200,10 @@ class TFBlock(tf.keras.layers.Layer):
return outputs # x, (attentions) return outputs # x, (attentions)
@keras_serializable
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
config_class = OpenAIGPTConfig
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs) super().__init__(*inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states self.output_hidden_states = config.output_hidden_states
......
...@@ -21,9 +21,18 @@ import logging
import tensorflow as tf import tensorflow as tf
from .configuration_roberta import RobertaConfig from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -82,6 +91,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
@keras_serializable
class TFRobertaMainLayer(TFBertMainLayer): class TFRobertaMainLayer(TFBertMainLayer):
""" """
Same as TFBertMainLayer but uses TFRobertaEmbeddings. Same as TFBertMainLayer but uses TFRobertaEmbeddings.
...@@ -337,7 +347,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
on top of the pooled output) e.g. for GLUE tasks. """, on top of the pooled output) e.g. for GLUE tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -346,7 +356,17 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
self.classifier = TFRobertaClassificationHead(config, name="classifier") self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
...@@ -370,20 +390,164 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output, training=training)
outputs = (logits,) + outputs[2:] outputs = (logits,) + outputs[2:]
return outputs # logits, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
"""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ROBERTA_START_DOCSTRING,
)
class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFBertMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [
flat_input_ids,
flat_attention_mask,
flat_token_type_ids,
flat_position_ids,
head_mask,
inputs_embeds,
]
outputs = self.roberta(flat_inputs, training=training)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output, training=training)
logits = self.classifier(pooled_output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
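For reference, the reshaping this head performs internally: choices are folded into the batch dimension for the encoder, and the per-choice scores are unfolded again before the softmax/loss. A standalone sketch of just that arithmetic, with illustrative shapes:

import tensorflow as tf

batch_size, num_choices, seq_length = 2, 4, 16
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_length))            # (8, 16) as seen by the encoder
per_choice_logits = tf.zeros((batch_size * num_choices, 1))         # classifier emits one score per choice
reshaped_logits = tf.reshape(per_choice_logits, (-1, num_choices))  # (2, 4) scores for the softmax/loss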
@add_start_docstrings( @add_start_docstrings(
...@@ -391,7 +555,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -403,8 +567,22 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
...@@ -428,27 +606,40 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base') model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0] sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
return outputs # scores, (hidden_states), (attentions) if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings( @add_start_docstrings(
"""RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, """RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
ROBERTA_START_DOCSTRING, ROBERTA_START_DOCSTRING,
) )
class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel): class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels self.num_labels = config.num_labels
...@@ -459,8 +650,31 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel):
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`):
...@@ -488,14 +702,23 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel):
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base') model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
input_ids = tokenizer.encode("Who was Jim Henson?", "Jim Henson was a nice puppet") question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
start_scores, end_scores = model(tf.constant(input_ids)[None, :]) # Batch size 1 input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids) all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1]) answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = outputs[0]
@@ -506,4 +729,10 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel):
outputs = (start_logits, end_logits,) + outputs[2:]
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
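For orientation, a minimal usage sketch of the new span-loss wiring (not part of the diff): it assumes the public `roberta-base` checkpoint and purely illustrative gold start/end indices. When both positions are passed, the model prepends the averaged start/end cross-entropy computed by TFQuestionAnsweringLoss.

import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering

# Hedged sketch: the start/end indices below are illustrative, not real annotations.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = TFRobertaForQuestionAnswering.from_pretrained("roberta-base")
inputs = tokenizer.encode_plus("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="tf")
outputs = model(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    start_positions=tf.constant([1]),  # gold span start, batch size 1
    end_positions=tf.constant([3]),    # gold span end, batch size 1
)
loss, start_logits, end_logits = outputs[:3]  # the per-example loss comes first when labels are given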
@@ -25,7 +25,8 @@ import tensorflow as tf
from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, keras_serializable, shape_list
from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
@@ -502,7 +503,10 @@ class _NoLayerEmbedTokens(object):
# The full model without a specific pretrained or finetuning head is
# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
####################################################
@keras_serializable
class TFT5MainLayer(tf.keras.layers.Layer):
config_class = T5Config
def __init__(self, config, embed_tokens=None, **kwargs):
super().__init__(**kwargs)
self.output_attentions = config.output_attentions
@@ -548,12 +552,32 @@ class TFT5MainLayer(tf.keras.layers.Layer):
use_cache=False,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
encoder_hidden_states = inputs[2] if len(inputs) > 2 else encoder_hidden_states
encoder_attention_mask = inputs[3] if len(inputs) > 3 else encoder_attention_mask
inputs_embeds = inputs[4] if len(inputs) > 4 else inputs_embeds
head_mask = inputs[5] if len(inputs) > 5 else head_mask
past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("decoder_input_ids")
attention_mask = inputs.get("decoder_attention_mask", attention_mask)
encoder_hidden_states = inputs.get("encoder_hidden_states", encoder_hidden_states)
encoder_attention_mask = inputs.get("encoder_attention_mask", encoder_attention_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
head_mask = inputs.get("head_mask", head_mask)
past_key_value_states = inputs.get("past_key_value_states", past_key_value_states)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both inputs and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, (-1, input_shape[-1]))
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
@@ -561,7 +585,7 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if inputs_embeds is None:
assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
inputs_embeds = self.embed_tokens(input_ids)
batch_size, seq_length = input_shape
...
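The reworked TFT5MainLayer.call follows the library's flexible-input convention: a plain tensor, a list/tuple, or a dict/BatchEncoding keyed with `decoder_input_ids` / `decoder_attention_mask` all resolve to the same arguments. A standalone sketch of that dispatch pattern (the helper name is illustrative, not part of the diff):

import tensorflow as tf

# Illustrative mirror of the tensor/list/dict dispatch added to TFT5MainLayer.call.
def unpack_decoder_inputs(inputs, attention_mask=None):
    if isinstance(inputs, (tuple, list)):
        input_ids = inputs[0]
        attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
    elif isinstance(inputs, dict):
        input_ids = inputs.get("decoder_input_ids")
        attention_mask = inputs.get("decoder_attention_mask", attention_mask)
    else:
        input_ids = inputs
    return input_ids, attention_mask

ids = tf.constant([[0, 5, 9]])
mask = tf.constant([[1, 1, 1]])
print(unpack_decoder_inputs(ids, mask))                                                   # positional style
print(unpack_decoder_inputs([ids, mask]))                                                 # list style
print(unpack_decoder_inputs({"decoder_input_ids": ids, "decoder_attention_mask": mask}))  # dict style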
@@ -734,7 +734,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
return outputs
class TFTransfoXLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
...
@@ -84,6 +84,7 @@ def keras_serializable(cls):
else:
raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)")
self._transformers_config = config
self._kwargs = kwargs
cls.__init__ = wrapped_init
@@ -94,6 +95,7 @@ def keras_serializable(cls):
def get_config(self):
cfg = super(cls, self).get_config()
cfg["transformers_config"] = self._transformers_config.to_dict()
cfg.update(self._kwargs)
return cfg
cls.get_config = get_config
@@ -104,6 +106,44 @@ def keras_serializable(cls):
return cls
class TFQuestionAnsweringLoss:
def compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
start_loss = loss_fn(labels["start_position"], logits[0])
end_loss = loss_fn(labels["end_position"], logits[1])
return (start_loss + end_loss) / 2.0
class TFTokenClassificationLoss:
def compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
active_loss = tf.reshape(labels, (-1,)) != -1
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
return loss_fn(labels, reduced_logits)
class TFSequenceClassificationLoss:
def compute_loss(self, labels, logits):
if shape_list(logits)[1] == 1:
loss_fn = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
else:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
return loss_fn(labels, logits)
TFMultipleChoiceLoss = TFSequenceClassificationLoss
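The new loss classes are plain mixins: each contributes a single `compute_loss` helper that a task head calls when labels are supplied and prepends to its output tuple. A minimal sketch, assuming the classes are imported directly from `transformers.modeling_tf_utils` as added here; all tensors are made up:

import tensorflow as tf
from transformers.modeling_tf_utils import TFSequenceClassificationLoss

loss_mixin = TFSequenceClassificationLoss()

# num_labels > 1 -> sparse categorical cross-entropy on the raw logits.
logits = tf.random.normal((2, 3))            # (batch_size, num_labels)
labels = tf.constant([0, 2])                 # one class id per example
per_example_ce = loss_mixin.compute_loss(labels, logits)   # shape (2,), reduction is NONE

# num_labels == 1 -> mean-squared error (regression branch).
regression_logits = tf.random.normal((2, 1))
targets = tf.constant([[0.3], [1.7]])
per_example_mse = loss_mixin.compute_loss(targets, regression_logits)

Because the reduction is NONE, the heads return a per-example loss vector rather than a scalar; averaging is left to the training loop.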
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
r""" Base class for all TF models.
@@ -1531,6 +1571,16 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
)
super().build(input_shape)
def get_config(self):
config = {
"vocab_size": self.vocab_size,
"hidden_size": self.hidden_size,
"initializer_range": self.initializer_range,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs.
Args:
...
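With `get_config` in place (plus the extra kwargs recorded by `keras_serializable`), the shared embedding layer can be rebuilt from its own config, which is what Keras (de)serialization relies on. A hedged sketch with illustrative sizes:

import tensorflow as tf
from transformers.modeling_tf_utils import TFSharedEmbeddings

layer = TFSharedEmbeddings(vocab_size=100, hidden_size=16, name="shared")
cfg = layer.get_config()                     # contains vocab_size, hidden_size, initializer_range
clone = TFSharedEmbeddings.from_config(cfg)  # same hyper-parameters, fresh (untrained) weights
ids = tf.constant([[1, 2, 3]])
embeddings = clone(ids, mode="embedding")    # shape (1, 3, 16)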
@@ -24,8 +24,19 @@ import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
)
from .tokenization_utils import BatchEncoding
@@ -198,7 +209,10 @@ class TFTransformerFFN(tf.keras.layers.Layer):
return x
@keras_serializable
class TFXLMMainLayer(tf.keras.layers.Layer):
config_class = XLMConfig
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.output_attentions = config.output_attentions
@@ -717,7 +731,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
the pooled output) e.g. for GLUE tasks. """,
XLM_START_DOCSTRING,
)
class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -726,8 +740,27 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
def call(
self,
input_ids,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
@@ -751,19 +784,261 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
"""XLM Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XLM_START_DOCSTRING,
)
class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
def call(
self,
inputs,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
langs = inputs[2] if len(inputs) > 2 else langs
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids
lengths = inputs[5] if len(inputs) > 5 else lengths
cache = inputs[6] if len(inputs) > 6 else cache
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
assert len(inputs) <= 9, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
langs = inputs.get("langs", langs)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
lengths = inputs.get("lengths", lengths)
cache = inputs.get("cache", cache)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 9, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [
flat_input_ids,
flat_attention_mask,
langs,
flat_token_type_ids,
flat_position_ids,
lengths,
cache,
head_mask,
inputs_embeds,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
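Like the other multiple-choice heads, TFXLMForMultipleChoice expects inputs of shape `(batch_size, num_choices, seq_len)`, flattens them to `(batch_size * num_choices, seq_len)` for the transformer, and reshapes the logits back to `(batch_size, num_choices)`. A hedged sketch of assembling such a batch; the prompt, choices and naive padding are illustrative only:

import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = TFXLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

prompt = "The dog wagged its"
choices = ["tail", "keyboard"]
encoded = [tokenizer.encode(prompt + " " + choice) for choice in choices]
max_len = max(len(ids) for ids in encoded)
padded = [ids + [tokenizer.pad_token_id] * (max_len - len(ids)) for ids in encoded]

input_ids = tf.constant(padded)[None, :, :]  # (1, num_choices, seq_len)
labels = tf.constant([0])                    # the first choice is the correct one
loss, classification_scores = model(input_ids, labels=labels)[:2]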
@add_start_docstrings(
"""XLM Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLM_START_DOCSTRING,
)
class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForTokenClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = transformer_outputs[0]
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
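TFTokenClassificationLoss masks out every position whose label is `-1`, so padding or special tokens can be excluded from the token-classification loss. A small sketch, again importing the mixin straight from `transformers.modeling_tf_utils`; the tensors are made up:

import tensorflow as tf
from transformers.modeling_tf_utils import TFTokenClassificationLoss

loss_mixin = TFTokenClassificationLoss()
logits = tf.random.normal((1, 4, 3))       # (batch_size, seq_len, num_labels)
labels = tf.constant([[2, 0, -1, -1]])     # the last two positions are ignored
active_losses = loss_mixin.compute_loss(labels, logits)  # shape (2,): one value per active token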
@add_start_docstrings(
@@ -771,7 +1046,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING,
)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -780,8 +1055,34 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
langs=None,
token_type_ids=None,
position_ids=None,
lengths=None,
cache=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLMConfig`) and inputs:
start_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`):
@@ -807,12 +1108,27 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
transformer_outputs = self.transformer(inputs, **kwargs)
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
sequence_output = transformer_outputs[0]
@@ -825,4 +1141,10 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
1:
] # Keep mems, hidden states, attentions if they are in it
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@@ -22,6 +22,8 @@ from .configuration_xlm_roberta import XLMRobertaConfig
from .file_utils import add_start_docstrings
from .modeling_tf_roberta import (
TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
TFRobertaForQuestionAnswering,
TFRobertaForSequenceClassification,
TFRobertaForTokenClassification,
TFRobertaModel,
@@ -114,3 +116,30 @@ class TFXLMRobertaForTokenClassification(TFRobertaForTokenClassification):
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_ROBERTA_START_DOCSTRING,
)
class TFXLMRobertaForQuestionAnswering(TFRobertaForQuestionAnswering):
"""
This class overrides :class:`~transformers.TFRobertaForQuestionAnswering`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@add_start_docstrings(
"""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XLM_ROBERTA_START_DOCSTRING,
)
class TFXLMRobertaForMultipleChoice(TFRobertaForMultipleChoice):
"""
This class overrides :class:`~transformers.TFRobertaForMultipleChoice`. Please check the
superclass for the appropriate documentation alongside usage examples.
"""
config_class = XLMRobertaConfig
@@ -23,11 +23,15 @@ import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
TFSequenceClassificationLoss,
TFSequenceSummary,
TFSharedEmbeddings,
TFTokenClassificationLoss,
get_initializer,
keras_serializable,
shape_list,
@@ -938,7 +942,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
the pooled output) e.g. for GLUE tasks. """,
XLNET_START_DOCSTRING,
)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -952,8 +956,28 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`):
@@ -981,12 +1005,24 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
)
output = transformer_outputs[0]
output = self.sequence_summary(output)
@@ -994,7 +1030,159 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if they are in it
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
"""XLNET Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
XLNET_START_DOCSTRING,
)
class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.sequence_summary = TFSequenceSummary(
config, initializer_range=config.initializer_range, name="sequence_summary"
)
self.logits_proj = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj"
)
@property
def dummy_inputs(self):
""" Dummy inputs to build the network.
Returns:
tf.Tensor with dummy inputs
"""
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
def call(
self,
inputs,
token_type_ids=None,
input_mask=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`:
`num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
mems = inputs[2] if len(inputs) > 2 else mems
perm_mask = inputs[3] if len(inputs) > 3 else perm_mask
target_mapping = inputs[4] if len(inputs) > 4 else target_mapping
token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
input_mask = inputs[6] if len(inputs) > 6 else input_mask
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
use_cache = inputs[9] if len(inputs) > 9 else use_cache
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
mems = inputs.get("mems", mems)
perm_mask = inputs.get("perm_mask", perm_mask)
target_mapping = inputs.get("target_mapping", target_mapping)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
input_mask = inputs.get("input_mask", input_mask)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
use_cache = inputs.get("use_cache", use_cache)
assert len(inputs) <= 10, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None:
num_choices = shape_list(input_ids)[1]
seq_length = shape_list(input_ids)[2]
else:
num_choices = shape_list(inputs_embeds)[1]
seq_length = shape_list(inputs_embeds)[2]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_input_mask = tf.reshape(input_mask, (-1, seq_length)) if input_mask is not None else None
flat_inputs = [
flat_input_ids,
flat_attention_mask,
mems,
perm_mask,
target_mapping,
flat_token_type_ids,
flat_input_mask,
head_mask,
inputs_embeds,
use_cache,
]
transformer_outputs = self.transformer(flat_inputs, training=training)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
logits = self.logits_proj(logits)
reshaped_logits = tf.reshape(logits, (-1, num_choices))
outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, reshaped_logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (mems), (hidden states), (attentions)
@add_start_docstrings(
@@ -1002,7 +1190,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLNET_START_DOCSTRING,
)
class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificationLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
@@ -1012,8 +1200,26 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
def call(
self,
input_ids=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
@@ -1041,19 +1247,36 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
training=training,
)
output = transformer_outputs[0]
logits = self.classifier(output)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if they are in it
if labels is not None:
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings(
@@ -1061,7 +1284,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING,
)
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnsweringLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -1070,8 +1293,35 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
training=False,
):
r""" r"""
start_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
loss (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
@@ -1103,12 +1353,27 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
transformer_outputs = self.transformer(
input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
training=training,
)
sequence_output = transformer_outputs[0]
@@ -1121,7 +1386,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
1:
] # Keep mems, hidden states, attentions if they are in it
if start_positions is not None and end_positions is not None:
labels = {"start_position": start_positions}
labels["end_position"] = end_positions
loss = self.compute_loss(labels, outputs[:2])
outputs = (loss,) + outputs
return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
...
@@ -58,27 +58,41 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
}
def create_optimizer(
init_lr,
num_train_steps,
num_warmup_steps,
min_lr_ratio=0.0,
adam_epsilon=1e-8,
weight_decay_rate=0.0,
include_in_weight_decay=None,
):
"""Creates an optimizer with learning rate schedule.""" """Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate. # Implements linear decay of the learning rate.
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=end_lr, initial_learning_rate=init_lr,
decay_steps=num_train_steps - num_warmup_steps,
end_learning_rate=init_lr * min_lr_ratio,
) )
if num_warmup_steps: if num_warmup_steps:
lr_schedule = WarmUp( lr_schedule = WarmUp(
initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps, initial_learning_rate=init_lr, decay_schedule_fn=lr_schedule, warmup_steps=num_warmup_steps,
) )
if weight_decay_rate > 0.0:
optimizer = AdamWeightDecay(
learning_rate=lr_schedule,
weight_decay_rate=weight_decay_rate,
beta_1=0.9,
beta_2=0.999,
epsilon=adam_epsilon,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
include_in_weight_decay=include_in_weight_decay,
)
else:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=adam_epsilon)
# We return the optimizer and the LR scheduler in order to better track the
# evolution of the LR independently of the optimizer.
return optimizer, lr_schedule
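Because `create_optimizer` now returns the optimizer together with its learning-rate schedule, callers unpack the pair and can log or plot the schedule independently of the optimizer. A hedged sketch with made-up hyper-parameters, assuming the function is importable from `transformers.optimization_tf`:

from transformers.optimization_tf import create_optimizer

# Warm up for 10% of the steps, then decay linearly; a non-zero weight_decay_rate
# selects AdamWeightDecay, while 0.0 falls back to plain Adam.
optimizer, lr_schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=1000,
    num_warmup_steps=100,
    weight_decay_rate=0.01,
)
print(float(lr_schedule(0)), float(lr_schedule(500)))  # learning rate at step 0 and mid-training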
class AdamWeightDecay(tf.keras.optimizers.Adam):
...