Unverified commit 4dc65591, authored by Patrick von Platen, committed by GitHub

[Almost all TF models] TF clean up: add missing CLM / MLM loss; fix T5 naming and keras compile (#5395)

* add first version of clm tf

* make style

* add more tests for bert

* update tf clm loss

* fix tests

* correct tf ner script

* add mlm loss

* delete bogus file

* clean tf auto model + add tests

* finish adding clm loss everywhere

* fix training in distilbert

* fix flake8

* save intermediate

* fix tf t5 naming

* remove prints

* finish up

* up

* fix tf gpt2

* fix new test utils import

* fix flake8

* keep backward compatibility

* Update src/transformers/modeling_tf_albert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_electra.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_roberta.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_mobilebert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_bert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_distilbert.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply sylvains suggestions
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 33e43edd
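The headline change: most TF model classes now accept a `labels` argument in `call` and, when labels are given, prepend the CLM/MLM loss to the returned tuple, mirroring their PyTorch counterparts. A minimal sketch of the new behavior (checkpoint and sentences are illustrative, not taken from the diff):

import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForMaskedLM.from_pretrained("bert-base-cased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="tf")
# -100 marks label positions the loss should ignore (see the NER fix below).
labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]

outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)
loss, prediction_scores = outputs[0], outputs[1]
print(float(tf.reduce_mean(loss)))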
@@ -17,6 +17,7 @@
 import logging
 import os
+import warnings
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple
@@ -184,7 +185,12 @@ def main():
         for i in range(batch_size):
             for j in range(seq_len):
-                if label_ids[i, j] != -1:
+                if label_ids[i, j] == -1:
+                    label_ids[i, j] = -100
+                    warnings.warn(
+                        "Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead."
+                    )
+                if label_ids[i, j] != -100:
                     out_label_list[i].append(label_map[label_ids[i][j]])
                     preds_list[i].append(label_map[preds[i][j]])
...
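The TF NER script previously used -1 as the ignore index; the new TF loss mixins standardize on -100, so the script remaps legacy labels and warns. The remapping logic in isolation (a sketch, with a hypothetical helper name):

import numpy as np

def remap_ignore_index(label_ids: np.ndarray) -> np.ndarray:
    # Hypothetical helper: map the legacy -1 ignore index to -100.
    out = label_ids.copy()
    out[out == -1] = -100
    return out

print(remap_ignore_index(np.array([[2, -1, 5]])))  # [[   2 -100    5]]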
@@ -453,6 +453,9 @@ if is_tf_available():
         TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
         TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
         TF_MODEL_WITH_LM_HEAD_MAPPING,
+        TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+        TF_MODEL_FOR_MASKED_LM_MAPPING,
+        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
         TFAutoModel,
         TFAutoModelForMultipleChoice,
         TFAutoModelForPreTraining,
@@ -460,6 +463,9 @@ if is_tf_available():
         TFAutoModelForSequenceClassification,
         TFAutoModelForTokenClassification,
         TFAutoModelWithLMHead,
+        TFAutoModelForCausalLM,
+        TFAutoModelForMaskedLM,
+        TFAutoModelForSeq2SeqLM,
     )
     from .modeling_tf_albert import (
@@ -478,6 +484,7 @@ if is_tf_available():
     from .modeling_tf_bert import (
         TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFBertEmbeddings,
+        TFBertLMHeadModel,
         TFBertForMaskedLM,
         TFBertForMultipleChoice,
         TFBertForNextSentencePrediction,
...
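The new mappings and auto classes are re-exported at the package root, so downstream code can import them directly (assuming a TensorFlow install):

from transformers import (
    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
    TF_MODEL_FOR_MASKED_LM_MAPPING,
    TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
    TFAutoModelForCausalLM,
    TFAutoModelForMaskedLM,
    TFAutoModelForSeq2SeqLM,
)

# The mappings are plain OrderedDicts, so the supported architectures are inspectable:
print([cfg.__name__ for cfg in TF_MODEL_FOR_CAUSAL_LM_MAPPING])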
@@ -73,6 +73,7 @@ from .modeling_bert import (
 from .modeling_camembert import (
     CamembertForMaskedLM,
     CamembertForMultipleChoice,
+    CamembertForQuestionAnswering,
     CamembertForSequenceClassification,
     CamembertForTokenClassification,
     CamembertModel,
@@ -306,6 +307,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
     [
         (DistilBertConfig, DistilBertForQuestionAnswering),
         (AlbertConfig, AlbertForQuestionAnswering),
+        (CamembertConfig, CamembertForQuestionAnswering),
         (BartConfig, BartForQuestionAnswering),
         (LongformerConfig, LongformerForQuestionAnswering),
         (XLMRobertaConfig, XLMRobertaForQuestionAnswering),
@@ -336,7 +338,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
     ]
 )
-
 MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
     [
         (CamembertConfig, CamembertForMultipleChoice),
...
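On the PyTorch side, the Camembert question-answering head is wired into the QA auto-mapping, so the auto class can now resolve Camembert configs. A sketch (config-only, untrained weights, for illustration):

from transformers import AutoModelForQuestionAnswering, CamembertConfig

model = AutoModelForQuestionAnswering.from_config(CamembertConfig())
print(type(model).__name__)  # CamembertForQuestionAnswering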
@@ -29,6 +29,7 @@ from .file_utils import (
 )
 from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
 from .modeling_tf_utils import (
+    TFMaskedLanguageModelingLoss,
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
     TFQuestionAnsweringLoss,
@@ -822,7 +823,7 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
 @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
-class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
+class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
@@ -834,8 +835,26 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
-    def call(self, inputs, **kwargs):
+    def call(
+        self,
+        inputs=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        training=False,
+    ):
         r"""
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
+            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``.
         Returns:
             :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
             prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`
@@ -852,14 +871,35 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
         """
-        outputs = self.albert(inputs, **kwargs)
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[8] if len(inputs) > 8 else labels
+            if len(inputs) > 8:
+                inputs = inputs[:8]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+        outputs = self.albert(
+            inputs,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            training=training,
+        )
         sequence_output = outputs[0]
-        prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False))
+        prediction_scores = self.predictions(sequence_output, training=training)
         # Add hidden states and attention if they are here
         outputs = (prediction_scores,) + outputs[2:]
+        if labels is not None:
+            loss = self.compute_loss(labels, prediction_scores)
+            outputs = (loss,) + outputs
         return outputs  # prediction_scores, (hidden_states), (attentions)
...
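With the expanded `call` signature, `labels` can arrive as a keyword argument, as the ninth element of a positional tuple/list, or as a key in the input dict/BatchEncoding; when present, the loss becomes the first output. A sketch of the dict form (checkpoint illustrative; the dict path follows the input parsing shown above):

from transformers import AlbertTokenizer, TFAlbertForMaskedLM

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = TFAlbertForMaskedLM.from_pretrained("albert-base-v2")

enc = tokenizer("Paris is the [MASK] of France.", return_tensors="tf")
enc["labels"] = enc["input_ids"]  # score every position; use -100 to skip positions

loss, prediction_scores = model(dict(enc))[:2]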
@@ -16,6 +16,7 @@
 import logging
+import warnings
 from collections import OrderedDict
 from .configuration_auto import (
@@ -54,6 +55,7 @@ from .modeling_tf_bert import (
     TFBertForQuestionAnswering,
     TFBertForSequenceClassification,
     TFBertForTokenClassification,
+    TFBertLMHeadModel,
     TFBertModel,
 )
 from .modeling_tf_camembert import (
@@ -140,126 +142,158 @@ logger = logging.getLogger(__name__)
 TF_MODEL_MAPPING = OrderedDict(
     [
-        (AlbertConfig, TFAlbertModel),
-        (CamembertConfig, TFCamembertModel),
-        (CTRLConfig, TFCTRLModel),
-        (DistilBertConfig, TFDistilBertModel),
-        (ElectraConfig, TFElectraModel),
-        (FlaubertConfig, TFFlaubertModel),
-        (GPT2Config, TFGPT2Model),
-        (MobileBertConfig, TFMobileBertModel),
-        (OpenAIGPTConfig, TFOpenAIGPTModel),
-        (RobertaConfig, TFRobertaModel),
-        (BertConfig, TFBertModel),
-        (T5Config, TFT5Model),
-        (TransfoXLConfig, TFTransfoXLModel),
-        (XLMConfig, TFXLMModel),
-        (XLMRobertaConfig, TFXLMRobertaModel),
-        (XLNetConfig, TFXLNetModel),
+        (T5Config, TFT5Model),
+        (DistilBertConfig, TFDistilBertModel),
+        (AlbertConfig, TFAlbertModel),
+        (CamembertConfig, TFCamembertModel),
+        (XLMRobertaConfig, TFXLMRobertaModel),
+        (RobertaConfig, TFRobertaModel),
+        (BertConfig, TFBertModel),
+        (OpenAIGPTConfig, TFOpenAIGPTModel),
+        (GPT2Config, TFGPT2Model),
+        (MobileBertConfig, TFMobileBertModel),
+        (TransfoXLConfig, TFTransfoXLModel),
+        (XLNetConfig, TFXLNetModel),
+        (FlaubertConfig, TFFlaubertModel),
+        (XLMConfig, TFXLMModel),
+        (CTRLConfig, TFCTRLModel),
+        (ElectraConfig, TFElectraModel),
     ]
 )
 TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
     [
-        (AlbertConfig, TFAlbertForPreTraining),
-        (CamembertConfig, TFCamembertForMaskedLM),
-        (CTRLConfig, TFCTRLLMHeadModel),
-        (DistilBertConfig, TFDistilBertForMaskedLM),
-        (ElectraConfig, TFElectraForPreTraining),
-        (FlaubertConfig, TFFlaubertWithLMHeadModel),
-        (GPT2Config, TFGPT2LMHeadModel),
-        (MobileBertConfig, TFMobileBertForPreTraining),
-        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
-        (RobertaConfig, TFRobertaForMaskedLM),
-        (BertConfig, TFBertForPreTraining),
-        (T5Config, TFT5ForConditionalGeneration),
-        (TransfoXLConfig, TFTransfoXLLMHeadModel),
-        (XLMConfig, TFXLMWithLMHeadModel),
-        (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
-        (XLNetConfig, TFXLNetLMHeadModel),
+        (T5Config, TFT5ForConditionalGeneration),
+        (DistilBertConfig, TFDistilBertForMaskedLM),
+        (AlbertConfig, TFAlbertForPreTraining),
+        (CamembertConfig, TFCamembertForMaskedLM),
+        (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
+        (RobertaConfig, TFRobertaForMaskedLM),
+        (BertConfig, TFBertForPreTraining),
+        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
+        (GPT2Config, TFGPT2LMHeadModel),
+        (MobileBertConfig, TFMobileBertForPreTraining),
+        (TransfoXLConfig, TFTransfoXLLMHeadModel),
+        (XLNetConfig, TFXLNetLMHeadModel),
+        (FlaubertConfig, TFFlaubertWithLMHeadModel),
+        (XLMConfig, TFXLMWithLMHeadModel),
+        (CTRLConfig, TFCTRLLMHeadModel),
+        (ElectraConfig, TFElectraForPreTraining),
     ]
 )
 TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
     [
-        (AlbertConfig, TFAlbertForMaskedLM),
-        (CamembertConfig, TFCamembertForMaskedLM),
-        (CTRLConfig, TFCTRLLMHeadModel),
-        (DistilBertConfig, TFDistilBertForMaskedLM),
-        (ElectraConfig, TFElectraForMaskedLM),
-        (FlaubertConfig, TFFlaubertWithLMHeadModel),
-        (GPT2Config, TFGPT2LMHeadModel),
-        (MobileBertConfig, TFMobileBertForMaskedLM),
-        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
-        (RobertaConfig, TFRobertaForMaskedLM),
-        (BertConfig, TFBertForMaskedLM),
-        (T5Config, TFT5ForConditionalGeneration),
-        (TransfoXLConfig, TFTransfoXLLMHeadModel),
-        (XLMConfig, TFXLMWithLMHeadModel),
-        (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
-        (XLNetConfig, TFXLNetLMHeadModel),
+        (T5Config, TFT5ForConditionalGeneration),
+        (DistilBertConfig, TFDistilBertForMaskedLM),
+        (AlbertConfig, TFAlbertForMaskedLM),
+        (CamembertConfig, TFCamembertForMaskedLM),
+        (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
+        (RobertaConfig, TFRobertaForMaskedLM),
+        (BertConfig, TFBertForMaskedLM),
+        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
+        (GPT2Config, TFGPT2LMHeadModel),
+        (MobileBertConfig, TFMobileBertForMaskedLM),
+        (TransfoXLConfig, TFTransfoXLLMHeadModel),
+        (XLNetConfig, TFXLNetLMHeadModel),
+        (FlaubertConfig, TFFlaubertWithLMHeadModel),
+        (XLMConfig, TFXLMWithLMHeadModel),
+        (CTRLConfig, TFCTRLLMHeadModel),
+        (ElectraConfig, TFElectraForMaskedLM),
     ]
 )
-TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
-    [
-        (AlbertConfig, TFAlbertForMultipleChoice),
-        (CamembertConfig, TFCamembertForMultipleChoice),
-        (DistilBertConfig, TFDistilBertForMultipleChoice),
-        (FlaubertConfig, TFFlaubertForMultipleChoice),
-        (MobileBertConfig, TFMobileBertForMultipleChoice),
-        (RobertaConfig, TFRobertaForMultipleChoice),
-        (BertConfig, TFBertForMultipleChoice),
-        (XLMConfig, TFXLMForMultipleChoice),
-        (XLMRobertaConfig, TFXLMRobertaForMultipleChoice),
-        (XLNetConfig, TFXLNetForMultipleChoice),
-    ]
-)
-TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
-    [
-        (AlbertConfig, TFAlbertForQuestionAnswering),
-        (CamembertConfig, TFCamembertForQuestionAnswering),
-        (DistilBertConfig, TFDistilBertForQuestionAnswering),
-        (ElectraConfig, TFElectraForQuestionAnswering),
-        (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple),
-        (MobileBertConfig, TFMobileBertForQuestionAnswering),
-        (RobertaConfig, TFRobertaForQuestionAnswering),
-        (BertConfig, TFBertForQuestionAnswering),
-        (XLMConfig, TFXLMForQuestionAnsweringSimple),
-        (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering),
-        (XLNetConfig, TFXLNetForQuestionAnsweringSimple),
-    ]
-)
+TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
+    [
+        (BertConfig, TFBertLMHeadModel),
+        (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
+        (GPT2Config, TFGPT2LMHeadModel),
+        (TransfoXLConfig, TFTransfoXLLMHeadModel),
+        (XLNetConfig, TFXLNetLMHeadModel),
+        (
+            XLMConfig,
+            TFXLMWithLMHeadModel,
+        ),  # XLM can be MLM and CLM => model should be split similar to BERT; leave here for now
+        (CTRLConfig, TFCTRLLMHeadModel),
+    ]
+)
+TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
+    [
+        (DistilBertConfig, TFDistilBertForMaskedLM),
+        (AlbertConfig, TFAlbertForMaskedLM),
+        (CamembertConfig, TFCamembertForMaskedLM),
+        (XLMRobertaConfig, TFXLMRobertaForMaskedLM),
+        (RobertaConfig, TFRobertaForMaskedLM),
+        (BertConfig, TFBertForMaskedLM),
+        (MobileBertConfig, TFMobileBertForMaskedLM),
+        (FlaubertConfig, TFFlaubertWithLMHeadModel),
+        (XLMConfig, TFXLMWithLMHeadModel),
+        (ElectraConfig, TFElectraForMaskedLM),
+    ]
+)
+TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict([(T5Config, TFT5ForConditionalGeneration)])
 TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
     [
-        (AlbertConfig, TFAlbertForSequenceClassification),
-        (CamembertConfig, TFCamembertForSequenceClassification),
-        (DistilBertConfig, TFDistilBertForSequenceClassification),
-        (FlaubertConfig, TFFlaubertForSequenceClassification),
-        (MobileBertConfig, TFMobileBertForSequenceClassification),
-        (RobertaConfig, TFRobertaForSequenceClassification),
-        (BertConfig, TFBertForSequenceClassification),
-        (XLMConfig, TFXLMForSequenceClassification),
-        (XLMRobertaConfig, TFXLMRobertaForSequenceClassification),
-        (XLNetConfig, TFXLNetForSequenceClassification),
+        (DistilBertConfig, TFDistilBertForSequenceClassification),
+        (AlbertConfig, TFAlbertForSequenceClassification),
+        (CamembertConfig, TFCamembertForSequenceClassification),
+        (XLMRobertaConfig, TFXLMRobertaForSequenceClassification),
+        (RobertaConfig, TFRobertaForSequenceClassification),
+        (BertConfig, TFBertForSequenceClassification),
+        (XLNetConfig, TFXLNetForSequenceClassification),
+        (MobileBertConfig, TFMobileBertForSequenceClassification),
+        (FlaubertConfig, TFFlaubertForSequenceClassification),
+        (XLMConfig, TFXLMForSequenceClassification),
     ]
 )
+TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
+    [
+        (DistilBertConfig, TFDistilBertForQuestionAnswering),
+        (AlbertConfig, TFAlbertForQuestionAnswering),
+        (CamembertConfig, TFCamembertForQuestionAnswering),
+        (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering),
+        (RobertaConfig, TFRobertaForQuestionAnswering),
+        (BertConfig, TFBertForQuestionAnswering),
+        (XLNetConfig, TFXLNetForQuestionAnsweringSimple),
+        (MobileBertConfig, TFMobileBertForQuestionAnswering),
+        (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple),
+        (XLMConfig, TFXLMForQuestionAnsweringSimple),
+        (ElectraConfig, TFElectraForQuestionAnswering),
+    ]
+)
 TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
     [
-        (AlbertConfig, TFAlbertForTokenClassification),
-        (CamembertConfig, TFCamembertForTokenClassification),
-        (DistilBertConfig, TFDistilBertForTokenClassification),
-        (ElectraConfig, TFElectraForTokenClassification),
-        (FlaubertConfig, TFFlaubertForTokenClassification),
-        (MobileBertConfig, TFMobileBertForTokenClassification),
-        (RobertaConfig, TFRobertaForTokenClassification),
-        (BertConfig, TFBertForTokenClassification),
-        (XLMConfig, TFXLMForTokenClassification),
-        (XLMRobertaConfig, TFXLMRobertaForTokenClassification),
-        (XLNetConfig, TFXLNetForTokenClassification),
+        (DistilBertConfig, TFDistilBertForTokenClassification),
+        (AlbertConfig, TFAlbertForTokenClassification),
+        (CamembertConfig, TFCamembertForTokenClassification),
+        (FlaubertConfig, TFFlaubertForTokenClassification),
+        (XLMConfig, TFXLMForTokenClassification),
+        (XLMRobertaConfig, TFXLMRobertaForTokenClassification),
+        (RobertaConfig, TFRobertaForTokenClassification),
+        (BertConfig, TFBertForTokenClassification),
+        (MobileBertConfig, TFMobileBertForTokenClassification),
+        (XLNetConfig, TFXLNetForTokenClassification),
+        (ElectraConfig, TFElectraForTokenClassification),
     ]
 )
+TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
+    [
+        (CamembertConfig, TFCamembertForMultipleChoice),
+        (XLMConfig, TFXLMForMultipleChoice),
+        (XLMRobertaConfig, TFXLMRobertaForMultipleChoice),
+        (RobertaConfig, TFRobertaForMultipleChoice),
+        (BertConfig, TFBertForMultipleChoice),
+        (DistilBertConfig, TFDistilBertForMultipleChoice),
+        (MobileBertConfig, TFMobileBertForMultipleChoice),
+        (XLNetConfig, TFXLNetForMultipleChoice),
+        (FlaubertConfig, TFFlaubertForMultipleChoice),
+        (AlbertConfig, TFAlbertForMultipleChoice),
+    ]
+)
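Entry order in these OrderedDicts is load-bearing: the auto classes resolve a config by walking the dict with isinstance checks, so subclass configs (CamembertConfig derives from RobertaConfig, which derives from BertConfig) must precede their parents or they would never match. A toy sketch of that lookup (illustrative classes, not the library's):

from collections import OrderedDict

class BertLikeConfig: pass
class RobertaLikeConfig(BertLikeConfig): pass

# Toy mapping mirroring the subclass-before-parent ordering above.
MAPPING = OrderedDict([(RobertaLikeConfig, "TFRobertaModel"), (BertLikeConfig, "TFBertModel")])

def resolve(config):
    for config_class, model_class in MAPPING.items():
        if isinstance(config, config_class):
            return model_class
    raise ValueError("Unrecognized configuration class {}".format(config.__class__))

print(resolve(RobertaLikeConfig()))  # TFRobertaModel, not TFBertModel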
@@ -303,11 +337,11 @@ class TFAutoModel(object):
         Note:
             Loading a model from its configuration file does **not** load the model weights.
-            It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load
+            It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
             the model weights
         Args:
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 The model class to instantiate is selected based on the configuration class:
                     - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model)
@@ -359,7 +393,7 @@ class TFAutoModel(object):
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
             from_pt: (`Optional`) Boolean
@@ -368,17 +402,17 @@ class TFAutoModel(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                     - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                    - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                     - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
             state_dict: (`optional`) dict:
                 an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -401,7 +435,7 @@ class TFAutoModel(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                     - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
         Examples::
@@ -452,11 +486,11 @@ class TFAutoModelForPreTraining(object):
         Note:
             Loading a model from its configuration file does **not** load the model weights.
-            It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load
+            It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
             the model weights
         Args:
-            config (:class:`~transformers.PretrainedConfig`):
+            config (:class:`~transformers.TFPretrainedConfig`):
                 The model class to instantiate is selected based on the configuration class:
                     - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertModelForMaskedLM` (DistilBERT model)
@@ -478,7 +512,7 @@ class TFAutoModelForPreTraining(object):
             if isinstance(config, config_class):
                 return model_class(config)
         raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
+            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
             "Model type should be one of {}.".format(
                 config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys())
             )
@@ -513,21 +547,21 @@ class TFAutoModelForPreTraining(object):
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
             model_args: (`optional`) Sequence of positional arguments:
                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                     - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                    - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                     - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
             state_dict: (`optional`) dict:
                 an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
@@ -549,7 +583,7 @@ class TFAutoModelForPreTraining(object):
                     underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
                     already been done)
                 - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                    initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
+                    initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of
                     ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
                     with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
                     attribute will be passed to the underlying model's ``__init__`` function.
@@ -573,7 +607,7 @@ class TFAutoModelForPreTraining(object):
             if isinstance(config, config_class):
                 return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
         raise ValueError(
-            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
+            "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
            "Model type should be one of {}.".format(
                 config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys())
             )
@@ -619,28 +653,32 @@ class TFAutoModelWithLMHead(object):
         Note:
             Loading a model from its configuration file does **not** load the model weights.
-            It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load
+            It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
             the model weights
         Args:
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
-                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: TFBertModel (Bert model)
                     - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
-                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
-                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model)
+                    - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model)
                     - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
-                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
-                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+                    - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: TFXLMModel (XLM model)
         Examples::
            config = BertConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
            model = TFAutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
         """
+        warnings.warn(
+            "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
+            FutureWarning,
+        )
         for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items():
             if isinstance(config, config_class):
                 return model_class(config)
@@ -676,7 +714,7 @@ class TFAutoModelWithLMHead(object):
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
             from_pt: (`Optional`) Boolean
@@ -685,17 +723,17 @@ class TFAutoModelWithLMHead(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                     - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                    - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                     - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
             state_dict: (`optional`) dict:
                 an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -718,7 +756,7 @@ class TFAutoModelWithLMHead(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                     - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
         Examples::
@@ -731,6 +769,10 @@ class TFAutoModelWithLMHead(object):
             model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
         """
+        warnings.warn(
+            "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.",
+            FutureWarning,
+        )
         config = kwargs.pop("config", None)
         if not isinstance(config, PretrainedConfig):
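TFAutoModelWithLMHead keeps working but now raises a FutureWarning from both from_config and from_pretrained; the replacement is objective-specific. A migration sketch (checkpoint names illustrative):

from transformers import (
    TFAutoModelForCausalLM,
    TFAutoModelForMaskedLM,
    TFAutoModelForSeq2SeqLM,
)

# Before: TFAutoModelWithLMHead.from_pretrained(...) for every LM flavor.
# After: pick the class that matches the training objective.
causal = TFAutoModelForCausalLM.from_pretrained("gpt2")                # next-token prediction
masked = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")   # masked-token prediction
seq2seq = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")          # encoder-decoder LM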
@@ -778,19 +820,19 @@ class TFAutoModelForMultipleChoice:
         Note:
             Loading a model from its configuration file does **not** load the model weights.
-            It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load
+            It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
             the model weights
         Args:
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 The model class to instantiate is selected based on the configuration class:
-                    - isInstance of `albert` configuration class: AlbertModel (Albert model)
-                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `albert` configuration class: TFAlbertModel (Albert model)
+                    - isInstance of `bert` configuration class: TFBertModel (Bert model)
         Examples::
             config = BertConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
-            model = AutoModelForMultipleChoice.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelForMultipleChoice.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
         """
         for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items():
             if isinstance(config, config_class):
@@ -824,7 +866,7 @@ class TFAutoModelForMultipleChoice:
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
             from_pt: (`Optional`) Boolean
@@ -833,17 +875,17 @@ class TFAutoModelForMultipleChoice:
             model_args: (`optional`) Sequence of positional arguments:
                 All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                     - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                    - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                    - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                     - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
             state_dict: (`optional`) dict:
                 an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -866,7 +908,7 @@ class TFAutoModelForMultipleChoice:
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                     - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
         Examples::
@@ -896,6 +938,406 @@ class TFAutoModelForMultipleChoice:
         )
class TFAutoModelForCausalLM:
r"""
:class:`~transformers.TFAutoModelForCausalLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"TFAutoModelForCausalLM is designed to be instantiated "
"using the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForCausalLM.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config (:class:`~transformers.TFPretrainedConfig`):
The model class to instantiate is selected based on the configuration class:
- isInstance of `bert` configuration class: :class:`~transformers.TFBertLMHeadModel` (Bert model)
- isInstance of `openai-gpt` configuration class: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
- isInstance of `gpt2` configuration class: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model)
- isInstance of `ctrl` configuration class: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model)
- isInstance of `transfo-xl` configuration class: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model)
- isInstance of `xlnet` configuration class: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model)
Examples::
config = GPT2Config.from_pretrained('gpt2') # Download configuration from S3 and cache.
model = TFAutoModelForCausalLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class(config)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
)
)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `bert`: :class:`~transformers.TFBertLMHeadModel` (Bert model)
- `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
- `gpt2`: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model)
- `transfo-xl`: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model)
- `xlnet`: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model)
- `ctrl`: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model)
The model is set in evaluation mode by default (all dropout modules are deactivated).
To run it in training mode, pass ``training=True`` to the model call (TF models have no `model.train()`/`model.eval()` switch).
Args:
pretrained_model_name_or_path:
Either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
These arguments will be passed to the configuration and the model.
Examples::
model = TFAutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
model = TFAutoModelForCausalLM.from_pretrained('./test/gpt2_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
assert model.config.output_attentions == True
# Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
config = GPT2Config.from_json_file('./pt_model/gpt2_pt_model_config.json')
model = TFAutoModelForCausalLM.from_pretrained('./pt_model/gpt2_pytorch_model.bin', from_pt=True, config=config)
"""
config = kwargs.pop("config", None)
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
)
)
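A minimal end-to-end sketch of the new causal-LM loss (assumes the callable-tokenizer API of transformers v3; passing ``labels`` makes the loss the first element of the returned tuple)::

    from transformers import AutoTokenizer, TFAutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = TFAutoModelForCausalLM.from_pretrained('gpt2')
    inputs = tokenizer("Hello, my dog is cute", return_tensors='tf')
    # Language modeling: the input ids double as labels; the model shifts them internally.
    loss, logits = model(inputs['input_ids'], labels=inputs['input_ids'])[:2]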
class TFAutoModelForMaskedLM:
r"""
:class:`~transformers.TFAutoModelForMaskedLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"TFAutoModelForMaskedLM is designed to be instantiated "
"using the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForMaskedLM.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config (:class:`~transformers.TFPretrainedConfig`):
The model class to instantiate is selected based on the configuration class:
- isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model)
- isInstance of `roberta` configuration class: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model)
- isInstance of `bert` configuration class: :class:`~transformers.TFBertForMaskedLM` (Bert model)
- isInstance of `flaubert` configuration class: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model)
- isInstance of `xlm` configuration class: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model)
- isInstance of `xlm-roberta` configuration class: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-Roberta model)
- isInstance of `electra` configuration class: :class:`~transformers.TFElectraForMaskedLM` (Electra model)
- isInstance of `camembert` configuration class: :class:`~transformers.TFCamembertForMaskedLM` (Camembert model)
- isInstance of `albert` configuration class: :class:`~transformers.TFAlbertForMaskedLM` (Albert model)
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = TFAutoModelForMaskedLM.from_config(config) # Instantiate the model from the configuration (the weights are randomly initialized).
"""
for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class(config)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys())
)
)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model)
- `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model)
- `camembert`: :class:`~transformers.TFCamembertForMaskedLM` (CamemBERT model)
- `xlm-roberta`: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-RoBERTa model)
- `longformer`: :class:`~transformers.TFLongformerForMaskedLM` (Longformer model)
- `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model)
- `xlm`: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model)
- `flaubert`: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model)
- `electra`: :class:`~transformers.TFElectraForMaskedLM` (Electra model)
- `bert`: :class:`~transformers.TFBertForMaskedLM` (Bert model)
The model is set in evaluation mode by default (all dropout modules are deactivated).
To run it in training mode, pass ``training=True`` to the model call (TF models have no `model.train()`/`model.eval()` switch).
Args:
pretrained_model_name_or_path:
Either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
These arguments will be passed to the configuration and the model.
Examples::
model = TFAutoModelForMaskedLM.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = TFAutoModelForMaskedLM.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
assert model.config.output_attentions == True
# Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
config = BertConfig.from_json_file('./pt_model/bert_pt_model_config.json')
model = TFAutoModelForMaskedLM.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
"""
config = kwargs.pop("config", None)
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys())
)
)
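A masked-LM sketch showing the ``-100`` convention (assumes both strings tokenize to the same length so the labels align with the inputs)::

    import tensorflow as tf
    from transformers import AutoTokenizer, TFAutoModelForMaskedLM

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = TFAutoModelForMaskedLM.from_pretrained('bert-base-uncased')
    inputs = tokenizer("The capital of France is [MASK].", return_tensors='tf')
    labels = tokenizer("The capital of France is Paris.", return_tensors='tf')['input_ids']
    # Keep the label only at the masked position; -100 excludes a token from the loss.
    labels = tf.where(tf.equal(inputs['input_ids'], tokenizer.mask_token_id), labels, -100)
    loss = model(inputs['input_ids'], labels=labels)[0]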
class TFAutoModelForSeq2SeqLM:
r"""
:class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class
that will be instantiated as one of the language modeling model classes of the library
when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
class method.
This class cannot be instantiated using `__init__()` (throws an error).
"""
def __init__(self):
raise EnvironmentError(
"TFAutoModelForSeq2SeqLM is designed to be instantiated "
"using the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForSeq2SeqLM.from_config(config)` methods."
)
@classmethod
def from_config(cls, config):
r""" Instantiates one of the base model classes of the library
from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config (:class:`~transformers.TFPretrainedConfig`):
The model class to instantiate is selected based on the configuration class:
- isInstance of `t5` configuration class: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model)
Examples::
config = T5Config.from_pretrained('t5-base')
model = TFAutoModelForSeq2SeqLM.from_config(config) # Instantiate the model from the configuration (the weights are randomly initialized).
"""
for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class(config)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__,
cls.__name__,
", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()),
)
)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiates one of the language modeling model classes of the library
from a pre-trained model configuration.
The `from_pretrained()` method takes care of returning the correct model class instance
based on the `model_type` property of the config object, or when it's missing,
falling back to using pattern matching on the `pretrained_model_name_or_path` string:
- `t5`: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model)
The model is set in evaluation mode by default (all dropout modules are deactivated).
To run it in training mode, pass ``training=True`` to the model call (TF models have no `model.train()`/`model.eval()` switch).
Args:
pretrained_model_name_or_path:
Either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
These arguments will be passed to the configuration and the model.
Examples::
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base') # Download model and configuration from S3 and cache.
model = TFAutoModelForSeq2SeqLM.from_pretrained('./test/t5_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
assert model.config.output_attentions == True
# Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
config = T5Config.from_json_file('./pt_model/t5_pt_model_config.json')
model = TFAutoModelForSeq2SeqLM.from_pretrained('./pt_model/t5_pytorch_model.bin', from_pt=True, config=config)
"""
config = kwargs.pop("config", None)
if not isinstance(config, PretrainedConfig):
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items():
if isinstance(config, config_class):
return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
raise ValueError(
"Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n"
"Model type should be one of {}.".format(
config.__class__,
cls.__name__,
", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()),
)
)
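A sequence-to-sequence sketch with the T5 shortcut (``model.generate`` on TF models is assumed available, as in recent releases)::

    from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base')
    inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors='tf')
    # The encoder runs once; the decoder is sampled autoregressively.
    output_ids = model.generate(inputs['input_ids'])
    print(tokenizer.decode(output_ids[0]))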
class TFAutoModelForSequenceClassification(object):
r"""
:class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
@@ -930,11 +1372,11 @@ class TFAutoModelForSequenceClassification(object):
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
The model class to instantiate is selected based on the configuration class:
- isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
@@ -946,7 +1388,7 @@ class TFAutoModelForSequenceClassification(object):
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = TFAutoModelForSequenceClassification.from_config(config) # Instantiate the model from the configuration (the weights are randomly initialized).
"""
for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items():
if isinstance(config, config_class):
@@ -983,7 +1425,7 @@ class TFAutoModelForSequenceClassification(object):
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
from_pt: (`Optional`) Boolean
@@ -992,17 +1434,17 @@ class TFAutoModelForSequenceClassification(object):
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
@@ -1025,7 +1467,7 @@ class TFAutoModelForSequenceClassification(object):
Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
@@ -1090,11 +1532,11 @@ class TFAutoModelForQuestionAnswering(object):
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
The model class to instantiate is selected based on the configuration class:
- isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
@@ -1145,7 +1587,7 @@ class TFAutoModelForQuestionAnswering(object):
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
from_pt: (`Optional`) Boolean
@@ -1154,17 +1596,17 @@ class TFAutoModelForQuestionAnswering(object):
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
@@ -1187,7 +1629,7 @@ class TFAutoModelForQuestionAnswering(object):
Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
@@ -1222,7 +1664,7 @@ class TFAutoModelForTokenClassification:
raise EnvironmentError(
"TFAutoModelForTokenClassification is designed to be instantiated "
"using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
"`TFAutoModelForTokenClassification.from_config(config)` methods."
)
@classmethod
@@ -1232,11 +1674,11 @@ class TFAutoModelForTokenClassification:
Note:
Loading a model from its configuration file does **not** load the model weights.
It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load
the model weights
Args:
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
The model class to instantiate is selected based on the configuration class:
- isInstance of `bert` configuration class: BertModel (Bert model)
@@ -1282,23 +1724,23 @@ class TFAutoModelForTokenClassification:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
@@ -1318,7 +1760,7 @@ class TFAutoModelForTokenClassification:
Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
...
@@ -29,6 +29,8 @@ from .file_utils import (
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
@@ -803,9 +805,12 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
assert (
not config.is_decoder
), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."
self.bert = TFBertMainLayer(config, name="bert")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
@@ -815,8 +820,26 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -833,13 +856,113 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
if len(inputs) > 8:
inputs = inputs[:8]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.bert(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, prediction_scores)
outputs = (loss,) + outputs
return outputs # (loss), prediction_scores, (hidden_states), (attentions)
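The ``compute_loss`` call above is inherited from the new ``TFMaskedLanguageModelingLoss`` mixin. Conceptually it reduces to the following sketch (function name and reduction details are assumptions, not the literal implementation in ``modeling_tf_utils.py``)::

    import tensorflow as tf

    def masked_lm_loss(labels, logits):
        # Per-token sparse cross-entropy, computed only where labels != -100.
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        flat_labels = tf.reshape(labels, (-1,))
        flat_logits = tf.reshape(logits, (-1, logits.shape[-1]))
        active = tf.not_equal(flat_labels, -100)  # drop the masked-out label positions
        return loss_fn(tf.boolean_mask(flat_labels, active), tf.boolean_mask(flat_logits, active))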
class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
assert config.is_decoder, "If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True`."
self.bert = TFBertMainLayer(config, name="bert")
self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")
def get_output_embeddings(self):
return self.bert.embeddings
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
if len(inputs) > 8:
inputs = inputs[:8]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.bert(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
sequence_output = outputs[0]
logits = self.mlm(sequence_output, training=training)
outputs = (logits,) + outputs[2:] # Add hidden states and attention if they are here
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # prediction_scores, (hidden_states), (attentions)
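The shift just before ``compute_loss`` aligns predictions with targets: the logit at position ``i`` predicts token ``i + 1``, so the last logit has no target and the first label has no prediction. A toy illustration (random logits, BERT-sized vocabulary assumed)::

    import tensorflow as tf

    labels = tf.constant([[10, 11, 12, 13]])   # (batch=1, seq_len=4)
    logits = tf.random.uniform((1, 4, 30522))  # (batch, seq_len, vocab_size)
    shifted_logits = logits[:, :-1]            # predictions for positions 1..3
    shifted_labels = labels[:, 1:]             # targets: 11, 12, 13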
...
@@ -24,6 +24,7 @@ import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
TFSharedEmbeddings,
cast_bool_to_primitive,
@@ -542,7 +543,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
(linear layer with weights tied to the input embeddings). """,
CTRL_START_DOCSTRING,
)
class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFCTRLMainLayer(config, name="transformer")
@@ -561,8 +562,26 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
@@ -583,11 +602,37 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[10] if len(inputs) > 10 else labels
if len(inputs) > 10:
inputs = inputs[:10]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
inputs,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_states = transformer_outputs[0]
logits = self.lm_head(hidden_states)
outputs = (logits,) + transformer_outputs[1:]
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # lm_logits, presents, (all hidden_states), (attentions)
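With the loss wired in, the model can also be trained through the plain Keras API; a fine-tuning sketch (the dataset pipeline is assumed to yield ``(input_ids, labels)`` pairs and is prepared elsewhere)::

    import tensorflow as tf
    from transformers import TFAutoModelForCausalLM

    model = TFAutoModelForCausalLM.from_pretrained('ctrl')
    # Keras applies the loss to the first output of the model, i.e. the logits.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    # model.fit(train_dataset, epochs=2)  # train_dataset: a tf.data.Dataset (assumed)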
@@ -30,6 +30,7 @@ from .file_utils import (
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
...@@ -116,7 +117,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
"""Get token embeddings of inputs.
Args:
inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids)
mode: string, a valid value is one of "embedding" and "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
...@@ -528,9 +529,9 @@ DISTILBERT_START_DOCSTRING = r"""
- a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
:obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
:obj:`model([input_ids, attention_mask])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
:obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
:obj:`model({'input_ids': input_ids})`
Parameters:
config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
...@@ -626,7 +627,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
@add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING,
)
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.vocab_size = config.vocab_size
...@@ -644,8 +645,23 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
def call(
self,
inputs=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
...@@ -663,7 +679,22 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
distilbert_output = self.distilbert(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
if len(inputs) > 6:
inputs = inputs[:6]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
distilbert_output = self.distilbert(
inputs,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_states = distilbert_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
...@@ -672,6 +703,11 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
prediction_logits = self.vocab_projector(prediction_logits)
outputs = (prediction_logits,) + distilbert_output[1:]
if labels is not None:
loss = self.compute_loss(labels, prediction_logits)
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
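The same pattern applies on the masked-LM side: positions labeled `-100` are excluded from the loss. A hedged sketch of scoring a single masked position (checkpoint name, prompt, and target word are assumptions for illustration):

import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
input_ids = tokenizer.encode("The capital of France is [MASK].", return_tensors="tf")
labels = np.full(input_ids.numpy().shape, -100)  # -100 everywhere: ignored by compute_loss
mask_pos = int(tf.where(input_ids[0] == tokenizer.mask_token_id)[0][0])
labels[0, mask_pos] = tokenizer.encode("paris", add_special_tokens=False)[0]
outputs = model(input_ids, labels=tf.constant(labels))
loss, prediction_logits = outputs[:2]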
...
...@@ -7,6 +7,7 @@ from transformers import ElectraConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFQuestionAnsweringLoss,
TFTokenClassificationLoss,
get_initializer,
...@@ -506,7 +507,7 @@ class TFElectraMaskedLMHead(tf.keras.layers.Layer):
the only model of the two to have been trained for the masked language modeling task.""",
ELECTRA_START_DOCSTRING,
)
class TFElectraForMaskedLM(TFElectraPreTrainedModel):
class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
...@@ -534,9 +535,16 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``.
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -553,6 +561,12 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
if isinstance(input_ids, (tuple, list)):
labels = input_ids[8] if len(input_ids) > 8 else labels
if len(input_ids) > 8:
input_ids = input_ids[:8]
elif isinstance(input_ids, (dict, BatchEncoding)):
labels = input_ids.pop("labels", labels)
generator_hidden_states = self.electra(
input_ids,
...@@ -571,6 +585,10 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
output = (prediction_scores,)
output += generator_hidden_states[1:]
if labels is not None:
loss = self.compute_loss(labels, prediction_scores)
output = (loss,) + output
return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
...
...@@ -24,6 +24,7 @@ import tensorflow as tf
from .configuration_gpt2 import GPT2Config
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
...@@ -272,8 +273,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
head_mask = inputs[5] if len(inputs) > 5 else head_mask
inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds
use_cache = inputs[7] if len(inputs) > 7 else use_cache
output_attentions = inputs[8] if len(inputs) > 7 else output_attentions
output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states
output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
...@@ -524,7 +525,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
(linear layer with weights tied to the input embeddings). """,
GPT2_START_DOCSTRING,
)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name="transformer")
...@@ -541,8 +542,26 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
def call(
self,
inputs,
past=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -563,12 +582,38 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
transformer_outputs = self.transformer(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[10] if len(inputs) > 10 else labels
if len(inputs) > 10:
inputs = inputs[:10]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
inputs,
past=past,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_states = transformer_outputs[0]
lm_logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:]
logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (logits,) + transformer_outputs[1:]
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
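The two-line shift above is what aligns causal-LM logits with their targets: the logit at position i scores the token at position i + 1, so the last logit and the first label are dropped. A toy illustration with plain TensorFlow (shapes only; the vocabulary size is arbitrary):

import tensorflow as tf

labels = tf.constant([[11, 12, 13, 14]])   # (batch=1, seq_len=4)
logits = tf.random.normal((1, 4, 100))     # (batch, seq_len, vocab)
shifted_logits = logits[:, :-1]            # predictions for positions 1..3
shifted_labels = labels[:, 1:]             # the tokens that actually follow
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = loss_fn(shifted_labels, shifted_logits)  # scalar mean over the 3 pairs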
...
...@@ -29,6 +29,7 @@ from .file_utils import (
)
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
...@@ -929,7 +930,7 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
@add_start_docstrings("""MobileBert Model with a `language modeling` head on top. """, MOBILEBERT_START_DOCSTRING)
class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
...@@ -941,8 +942,25 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
def call(
self,
inputs=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -959,14 +977,34 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.mobilebert(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
if len(inputs) > 8:
inputs = inputs[:8]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.mobilebert(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False))
prediction_scores = self.mlm(sequence_output, training=training)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, prediction_scores)
outputs = (loss,) + outputs
return outputs # prediction_scores, (hidden_states), (attentions)
return outputs # (loss), prediction_scores, (hidden_states), (attentions)
class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer):
...
...@@ -24,6 +24,7 @@ import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFConv1D,
TFPreTrainedModel,
TFSequenceSummary,
...@@ -479,7 +480,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
(linear layer with weights tied to the input embeddings). """,
OPENAI_GPT_START_DOCSTRING,
)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
...@@ -489,8 +490,24 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -507,12 +524,35 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
transformer_outputs = self.transformer(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
if len(inputs) > 8:
inputs = inputs[:8]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_states = transformer_outputs[0]
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:]
logits = self.transformer.tokens_embed(hidden_states, mode="linear")
outputs = (logits,) + transformer_outputs[1:]
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs # (loss), lm_logits, (all hidden_states), (attentions)
...
...@@ -29,6 +29,7 @@ from .file_utils import (
)
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import (
TFMaskedLanguageModelingLoss,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
...@@ -264,7 +265,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
...@@ -276,8 +277,26 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
def call(
self,
inputs=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -294,14 +313,37 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
outputs = self.roberta(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
if len(inputs) > 8:
inputs = inputs[:8]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
outputs = self.roberta(
inputs,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
return outputs # prediction_scores, (hidden_states), (attentions)
if labels is not None:
loss = self.compute_loss(labels, prediction_scores)
outputs = (loss,) + outputs
return outputs # (loss), prediction_scores, (hidden_states), (attentions)
class TFRobertaClassificationHead(tf.keras.layers.Layer):
...
...@@ -20,12 +20,14 @@ import copy
import itertools
import logging
import math
import warnings
import tensorflow as tf
from .configuration_t5 import T5Config
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFPreTrainedModel,
TFSharedEmbeddings,
cast_bool_to_primitive,
...@@ -111,6 +113,7 @@ class TFT5Attention(tf.keras.layers.Layer):
super().__init__(**kwargs)
self.layer_id = next(TFT5Attention.NEW_ID)
self.is_decoder = config.is_decoder
self.use_cache = config.use_cache
self.has_relative_attention_bias = has_relative_attention_bias
self.relative_attention_num_buckets = config.relative_attention_num_buckets
...@@ -258,9 +261,7 @@ class TFT5Attention(tf.keras.layers.Layer):
k, v = past_key_value_state
# to cope with keras serialization
use_cache = cast_bool_to_primitive(use_cache)
if self.is_decoder and use_cache is True:
if self.is_decoder and cast_bool_to_primitive(use_cache, self.use_cache) is True:
present_key_value_state = ((k, v),)
else:
present_key_value_state = (None,)
...@@ -295,7 +296,7 @@ class TFT5Attention(tf.keras.layers.Layer):
outputs = (context,) + present_key_value_state
if cast_bool_to_primitive(output_attentions) is True:
if cast_bool_to_primitive(output_attentions, True) is True:
outputs = outputs + (weights,)
if self.has_relative_attention_bias:
outputs = outputs + (position_bias,)
...@@ -572,18 +573,22 @@ class TFT5MainLayer(tf.keras.layers.Layer):
inputs_embeds = inputs[4] if len(inputs) > 4 else inputs_embeds
head_mask = inputs[5] if len(inputs) > 5 else head_mask
past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
use_cache = inputs[7] if len(inputs) > 7 else use_cache
assert len(inputs) <= 8, "Too many inputs."
output_attentions = inputs[8] if len(inputs) > 8 else output_attentions
output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states
assert len(inputs) <= 10, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("decoder_input_ids")
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("decoder_attention_mask", attention_mask)
attention_mask = inputs.get("attention_mask", attention_mask)
encoder_hidden_states = inputs.get("encoder_hidden_states", encoder_hidden_states)
encoder_attention_mask = inputs.get("encoder_attention_mask", encoder_attention_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
head_mask = inputs.get("head_mask", head_mask)
past_key_value_states = inputs.get("past_key_value_states", past_key_value_states)
use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 8, "Too many inputs."
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 10, "Too many inputs."
else:
input_ids = inputs
...@@ -733,8 +738,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if use_cache is True:
assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self)
# need to check if is decoder here as well for special cases when using keras compile
if cast_bool_to_primitive(use_cache, self.use_cache) is True and self.is_decoder:
outputs = outputs + (present_key_value_states,)
if cast_bool_to_primitive(output_hidden_states) is True:
outputs = outputs + (all_hidden_states,)
...@@ -763,12 +768,38 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
inputs = tf.constant(DUMMY_INPUTS)
input_mask = tf.constant(DUMMY_MASK)
dummy_inputs = {
"inputs": inputs,
"input_ids": inputs,
"decoder_input_ids": inputs,
"decoder_attention_mask": input_mask,
}
return dummy_inputs
def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id
assert (
decoder_start_token_id is not None
), "self.model.config.decoder_start_token_id has to be defined. In TF T5 it is usually set to the pad_token_id. See T5 docs for more information"
# shift inputs to the right
shifted_input_ids = tf.cast(input_ids, tf.int32)
shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1)
start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id)
shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1)
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids = tf.where(
shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids
)
assert tf.math.reduce_all(
shifted_input_ids >= 0
).numpy(), "Verify that `labels` has only positive values and -100"
return shifted_input_ids
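`_shift_right` turns `labels` into `decoder_input_ids`: prepend the decoder start token, drop the last label, and map any remaining `-100` back to the pad id. A toy walk-through (start and pad token ids set to 0, as in the released T5 configs):

import tensorflow as tf

labels = tf.constant([[42, 43, 44, -100]], dtype=tf.int32)
rolled = tf.roll(labels, 1, axis=-1)             # [[-100, 42, 43, 44]]
start = tf.fill((1, 1), 0)                       # decoder_start_token_id
shifted = tf.concat([start, rolled[:, 1:]], -1)  # [[0, 42, 43, 44]]
shifted = tf.where(shifted == -100, tf.fill(tf.shape(shifted), 0), shifted)  # pad_token_id
print(shifted)  # tf.Tensor([[ 0 42 43 44]], shape=(1, 4), dtype=int32)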
T5_START_DOCSTRING = r""" T5_START_DOCSTRING = r"""
The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
...@@ -900,7 +931,22 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -900,7 +931,22 @@ class TFT5Model(TFT5PreTrainedModel):
return self.decoder return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs): def call(
self,
inputs,
attention_mask=None,
encoder_outputs=None,
inputs_embeds=None,
head_mask=None,
decoder_past_key_value_states=None,
decoder_input_ids=None,
decoder_attention_mask=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
training=False,
):
r""" r"""
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
...@@ -934,37 +980,58 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -934,37 +980,58 @@ class TFT5Model(TFT5PreTrainedModel):
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
if isinstance(inputs, (tuple, list)):
if isinstance(inputs, dict):
kwargs.update(inputs)
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
head_mask = inputs[4] if len(inputs) > 4 else head_mask
decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states
decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids
decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask
decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds
use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
assert len(inputs) <= 12, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
input_ids = inputs.get("inputs")
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
encoder_outputs = inputs.get("encoder_outputs", encoder_outputs)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
head_mask = inputs.get("head_mask", head_mask)
decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states)
decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids)
decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask)
decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds)
use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
assert len(inputs) <= 12, "Too many inputs."
else:
kwargs["inputs"] = inputs
input_ids = inputs
# retrieve arguments
inputs = kwargs.get("inputs", None)
inputs_embeds = kwargs.get("inputs_embeds", None)
attention_mask = kwargs.get("attention_mask", None)
encoder_outputs = kwargs.get("encoder_outputs", None)
decoder_input_ids = kwargs.get("decoder_input_ids", None)
decoder_attention_mask = kwargs.get("decoder_attention_mask", None)
decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None)
decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None)
use_cache = kwargs.get("use_cache", None)
head_mask = kwargs.get("head_mask", None)
output_attentions = kwargs.get("output_attentions", None)
output_hidden_states = kwargs.get("output_hidden_states", None)
use_cache = use_cache if use_cache is not None else self.config.use_cache
# Encode if needed (training, first prediction pass)
if encoder_outputs is None:
encoder_outputs = self.encoder(
inputs,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
encoder_outputs = self.encoder(
[
input_ids,
attention_mask,
None,
None,
inputs_embeds,
head_mask,
None,
False,
output_attentions,
output_hidden_states,
],
training=training,
)
hidden_states = encoder_outputs[0]
...@@ -979,19 +1046,22 @@ class TFT5Model(TFT5PreTrainedModel):
# Decode
decoder_outputs = self.decoder(
decoder_input_ids,
attention_mask=decoder_attention_mask,
inputs_embeds=decoder_inputs_embeds,
past_key_value_states=decoder_past_key_value_states,
encoder_hidden_states=hidden_states,
encoder_attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
decoder_outputs = self.decoder(
[
decoder_input_ids,
decoder_attention_mask,
hidden_states,
attention_mask,
decoder_inputs_embeds,
head_mask,
decoder_past_key_value_states,
use_cache,
output_attentions,
output_hidden_states,
],
training=training,
)
if use_cache is True:
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
...@@ -999,7 +1069,7 @@ class TFT5Model(TFT5PreTrainedModel):
@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING)
class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.model_dim = config.d_model
...@@ -1042,8 +1112,28 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
return self.decoder
@add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs):
def call(
self,
inputs,
attention_mask=None,
encoder_outputs=None,
inputs_embeds=None,
head_mask=None,
decoder_past_key_value_states=None,
decoder_input_ids=None,
decoder_attention_mask=None,
decoder_inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Returns: Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs:
prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
...@@ -1080,25 +1170,41 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
>>> result = model.generate(inputs)
"""
if isinstance(inputs, (tuple, list)):
if isinstance(inputs, dict):
kwargs.update(inputs)
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs
inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
head_mask = inputs[4] if len(inputs) > 4 else head_mask
decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states
decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids
decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask
decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds
use_cache = inputs[9] if len(inputs) > 9 else use_cache
output_attentions = inputs[10] if len(inputs) > 10 else output_attentions
output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states
labels = inputs[12] if len(inputs) > 12 else labels
assert len(inputs) <= 13, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
if "inputs" in inputs:
warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.")
input_ids = inputs.get("inputs")
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
encoder_outputs = inputs.get("encoder_outputs", encoder_outputs)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
head_mask = inputs.get("head_mask", head_mask)
decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states)
decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids)
decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask)
decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds)
use_cache = inputs.get("use_cache", use_cache)
output_attentions = inputs.get("output_attentions", output_attentions)
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
labels = inputs.get("labels", labels)
assert len(inputs) <= 13, "Too many inputs."
else:
kwargs["inputs"] = inputs
input_ids = inputs
# retrieve arguments
inputs = kwargs.get("inputs", None)
decoder_input_ids = kwargs.get("decoder_input_ids", None)
attention_mask = kwargs.get("attention_mask", None)
encoder_outputs = kwargs.get("encoder_outputs", None)
decoder_attention_mask = kwargs.get("decoder_attention_mask", None)
decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None)
use_cache = kwargs.get("use_cache", None)
inputs_embeds = kwargs.get("inputs_embeds", None)
decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None)
head_mask = kwargs.get("head_mask", None)
output_attentions = kwargs.get("output_attentions", None)
output_hidden_states = kwargs.get("output_hidden_states", None)
use_cache = use_cache if use_cache is not None else self.config.use_cache
...@@ -1106,16 +1212,27 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
if encoder_outputs is None:
# Convert encoder inputs in embeddings if needed
encoder_outputs = self.encoder(
inputs,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
encoder_outputs = self.encoder(
[
input_ids,
attention_mask,
None,
None,
inputs_embeds,
head_mask,
None,
False,
output_attentions,
output_hidden_states,
],
training=training,
)
hidden_states = encoder_outputs[0]
if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
# get decoder inputs from shifting lm labels to the right
decoder_input_ids = self._shift_right(labels)
# If decoding with past key value states, only the last tokens
# should be given as an input
if decoder_past_key_value_states is not None:
...@@ -1126,28 +1243,35 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
# Decode
decoder_outputs = self.decoder(
decoder_input_ids,
attention_mask=decoder_attention_mask,
inputs_embeds=decoder_inputs_embeds,
past_key_value_states=decoder_past_key_value_states,
encoder_hidden_states=hidden_states,
encoder_attention_mask=attention_mask,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
decoder_outputs = self.decoder(
[
decoder_input_ids,
decoder_attention_mask,
hidden_states,
attention_mask,
decoder_inputs_embeds,
head_mask,
decoder_past_key_value_states,
use_cache,
output_attentions,
output_hidden_states,
],
training=training,
)
# insert decoder past at right place
# to speed up decoding
if use_cache is True:
if cast_bool_to_primitive(use_cache, self.config.use_cache) is True:
past = ((encoder_outputs, decoder_outputs[1]),)
decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:]
sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
embed_tokens = self.get_output_embeddings()
lm_logits = embed_tokens(sequence_output, mode="linear")
decoder_outputs = (lm_logits,) + decoder_outputs[1:]
logits = embed_tokens(sequence_output, mode="linear")
decoder_outputs = (logits,) + decoder_outputs[1:]
if labels is not None:
loss = self.compute_loss(labels, logits)
decoder_outputs = (loss,) + decoder_outputs
return decoder_outputs + encoder_outputs
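Taken together, passing `labels` is now enough to train the TF T5 stack: the decoder inputs are derived via `_shift_right` and the loss lands at position 0 of the returned tuple. A minimal sketch (the "t5-small" checkpoint and the translation pair are assumptions for illustration):

from transformers import T5Tokenizer, TFT5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
input_ids = tokenizer.encode("translate English to German: Hello.", return_tensors="tf")
labels = tokenizer.encode("Hallo.", return_tensors="tf")
outputs = model(input_ids, labels=labels)  # decoder_input_ids are built internally
loss = outputs[0]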
...
...@@ -17,6 +17,7 @@
import functools
import logging
import os
import warnings
import h5py
import numpy as np
...@@ -107,6 +108,19 @@ def keras_serializable(cls):
return cls
class TFCausalLanguageModelingLoss:
def compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
# make sure only labels that are not equal to -100
# are taken into account as loss
active_loss = tf.reshape(labels, (-1,)) != -100
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
return loss_fn(labels, reduced_logits)
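Because the reduction is `NONE`, `compute_loss` returns one cross-entropy value per unmasked token, which keeps it usable both standalone and under keras. A self-contained sketch of the masking logic (toy shapes and label values):

import tensorflow as tf

vocab = 5
logits = tf.random.normal((2, 3, vocab))            # (batch, seq_len, vocab)
labels = tf.constant([[1, 2, -100], [-100, 3, 4]])
active = tf.reshape(labels, (-1,)) != -100          # four active positions
flat_logits = tf.boolean_mask(tf.reshape(logits, (-1, vocab)), active)
flat_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
per_token_loss = loss_fn(flat_labels, flat_logits)  # shape (4,)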
class TFQuestionAnsweringLoss:
def compute_loss(self, labels, logits):
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
...@@ -123,7 +137,13 @@ class TFTokenClassificationLoss:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
active_loss = tf.reshape(labels, (-1,)) != -1
# make sure only labels that are not equal to -100
# are taken into account as loss
if tf.math.reduce_any(labels == -1).numpy():
warnings.warn("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.")
active_loss = tf.reshape(labels, (-1,)) != -1
else:
active_loss = tf.reshape(labels, (-1,)) != -100
reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
...@@ -143,6 +163,7 @@ class TFSequenceClassificationLoss:
TFMultipleChoiceLoss = TFSequenceClassificationLoss
TFMaskedLanguageModelingLoss = TFCausalLanguageModelingLoss
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
...
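One consequence of giving every head a `compute_loss` with the `(y_true, y_pred)` signature is that these models can be compiled like ordinary keras models. A hedged sketch of that wiring (the dataset layout is omitted, and this pattern fits heads whose labels need no shifting, such as masked LM):

import tensorflow as tf
from transformers import TFDistilBertForMaskedLM

model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# compute_loss has the (y_true, y_pred) signature keras expects, and -100
# labels are masked out, so the model can be trained with model.fit
model.compile(optimizer=optimizer, loss=model.compute_loss)
# model.fit(dataset_of_input_ids_and_labels, epochs=1)  # dataset assumed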
...@@ -30,6 +30,7 @@ from .file_utils import (
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFCausalLanguageModelingLoss,
TFMultipleChoiceLoss,
TFPreTrainedModel,
TFQuestionAnsweringLoss,
...@@ -871,7 +872,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
(linear layer with weights tied to the input embeddings). """,
XLNET_START_DOCSTRING,
)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
...@@ -912,8 +913,28 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
return inputs
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs):
def call(
self,
inputs,
attention_mask=None,
mems=None,
perm_mask=None,
target_mapping=None,
token_type_ids=None,
input_mask=None,
head_mask=None,
inputs_embeds=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``.
Return: Return:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs:
prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
...@@ -957,12 +978,40 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
if len(inputs) > 12:
inputs = inputs[:12]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
inputs,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
training=training,
)
hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state)
logits = self.lm_loss(hidden_state, training=training)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if they are in it
if labels is not None:
# shift labels to the left and cut last logit token
logits = logits[:, :-1]
labels = labels[:, 1:]
loss = self.compute_loss(labels, logits)
outputs = (loss,) + outputs
return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
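With the loss wired in, the model is consumed the usual way; a hypothetical usage sketch (checkpoint name chosen for illustration) — passing `labels` prepends the CLM loss to the output tuple:

import tensorflow as tf
from transformers import TFXLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = TFXLNetLMHeadModel.from_pretrained("xlnet-base-cased")

input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")
# labels == input_ids: the model shifts internally (logits[:, :-1] vs labels[:, 1:]),
# so the loss measures next-token prediction
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[0], outputs[1]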
...
...@@ -1041,9 +1041,9 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
head_mask=None,
inputs_embeds=None,
use_cache=True,
output_attentions=None,
output_hidden_states=None,
labels=None,
):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`):
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import unittest import unittest
from transformers import is_torch_available from transformers import is_torch_available
from transformers.testing_utils import require_torch, torch_device from transformers.testing_utils import require_torch, slow, torch_device
from .test_configuration_common import ConfigTester from .test_configuration_common import ConfigTester
from .test_modeling_common import ModelTesterMixin, ids_tensor from .test_modeling_common import ModelTesterMixin, ids_tensor
...@@ -32,6 +32,7 @@ if is_torch_available(): ...@@ -32,6 +32,7 @@ if is_torch_available():
DistilBertForTokenClassification,
DistilBertForQuestionAnswering,
DistilBertForSequenceClassification,
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)
class DistilBertModelTester(object):
...@@ -276,8 +277,8 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = DistilBertModel.from_pretrained(model_name)
self.assertIsNotNone(model)
...@@ -24,6 +24,8 @@ if is_tf_available():
from transformers import (
AutoConfig,
BertConfig,
GPT2Config,
T5Config,
TFAutoModel,
TFBertModel,
TFAutoModelForPreTraining,
...@@ -35,6 +37,25 @@ if is_tf_available():
TFBertForSequenceClassification,
TFAutoModelForQuestionAnswering,
TFBertForQuestionAnswering,
TFAutoModelForCausalLM,
TFGPT2LMHeadModel,
TFAutoModelForMaskedLM,
TFAutoModelForSeq2SeqLM,
TFT5ForConditionalGeneration,
)
from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.modeling_tf_auto import (
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
)
...@@ -72,10 +93,21 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForPreTraining)
@slow
def test_model_for_causal_lm(self):
for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, GPT2Config)
model = TFAutoModelForCausalLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFGPT2LMHeadModel)
@slow
def test_lmhead_model_from_pretrained(self):
for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
...@@ -84,6 +116,30 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_model_for_masked_lm(self):
for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_model_for_encoder_decoder_lm(self):
for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, T5Config)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)
model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, TFT5ForConditionalGeneration)
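Beyond the tests, the three new auto classes follow the familiar `from_pretrained` pattern; for example (checkpoints chosen for illustration):

from transformers import TFAutoModelForCausalLM, TFAutoModelForMaskedLM, TFAutoModelForSeq2SeqLM

# each auto class resolves the concrete architecture from the checkpoint's config
clm = TFAutoModelForCausalLM.from_pretrained("gpt2")  # -> TFGPT2LMHeadModel
mlm = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")  # -> TFBertForMaskedLM
seq2seq = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")  # -> TFT5ForConditionalGeneration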
@slow
def test_sequence_classification_model_from_pretrained(self):
# for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
...@@ -119,3 +175,28 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsInstance(model, TFRobertaForMaskedLM)
self.assertEqual(model.num_parameters(), 14830)
self.assertEqual(model.num_parameters(only_trainable=True), 14830)
def test_parents_and_children_in_mappings(self):
# Test that the children are placed before the parents in the mappings, as `isinstance` would otherwise
# match the parent class first and return the wrong model type when using the auto classes
mappings = (
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_MASKED_LM_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
)
for mapping in mappings:
mapping = tuple(mapping.items())
for index, (child_config, child_model) in enumerate(mapping[1:]):
for parent_config, parent_model in mapping[: index + 1]:
with self.subTest(
msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
):
self.assertFalse(issubclass(child_config, parent_config))
self.assertFalse(issubclass(child_model, parent_model))
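The ordering matters because the auto classes walk the mapping and take the first `isinstance` match; a toy sketch of the failure mode this test guards against:

from collections import OrderedDict


class ParentConfig:
    pass


class ChildConfig(ParentConfig):
    pass


# wrong order: the parent entry shadows the child
mapping = OrderedDict([(ParentConfig, "ParentModel"), (ChildConfig, "ChildModel")])

config = ChildConfig()
for config_class, model_class in mapping.items():
    if isinstance(config, config_class):
        print(model_class)  # prints "ParentModel" -- the wrong model for a ChildConfig
        break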
...@@ -27,6 +27,7 @@ if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_bert import (
TFBertModel,
TFBertLMHeadModel,
TFBertForMaskedLM,
TFBertForNextSentencePrediction,
TFBertForPreTraining,
...@@ -142,11 +143,30 @@ class TFBertModelTester:
)
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
def create_and_check_bert_lm_head(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
config.is_decoder = True
model = TFBertLMHeadModel(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(prediction_scores,) = model(inputs)
self.parent.assertListEqual(
list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size]
)
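The new check mirrors how the model would serve next-token prediction; a minimal sketch with a randomly initialized model (hypothetical token ids):

import tensorflow as tf
from transformers import BertConfig, TFBertLMHeadModel

config = BertConfig(is_decoder=True)  # the LM head model expects a decoder config
model = TFBertLMHeadModel(config=config)

input_ids = tf.constant([[101, 7592, 102]])  # hypothetical ids; weights are untrained
(prediction_scores,) = model({"input_ids": input_ids})
next_token_id = tf.argmax(prediction_scores[:, -1, :], axis=-1)  # greedy next token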
def create_and_check_bert_for_masked_lm(
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFBertForMaskedLM(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(prediction_scores,) = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
...@@ -186,11 +206,14 @@ class TFBertModelTester:
):
config.num_labels = self.num_labels
model = TFBertForSequenceClassification(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {
(logits,) = model(inputs) "input_ids": input_ids,
result = { "attention_mask": input_mask,
"logits": logits.numpy(), "token_type_ids": token_type_ids,
} }
(logits,) = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels])
def create_and_check_bert_for_multiple_choice(
...@@ -207,9 +230,7 @@ class TFBertModelTester:
"token_type_ids": multiple_choice_token_type_ids,
}
(logits,) = model(inputs)
result = {"logits": logits.numpy()}
self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
def create_and_check_bert_for_token_classification(
...@@ -217,7 +238,11 @@ class TFBertModelTester:
):
config.num_labels = self.num_labels
model = TFBertForTokenClassification(config=config)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": token_type_ids,
}
(logits,) = model(inputs)
result = {
"logits": logits.numpy(),
...@@ -228,12 +253,14 @@ class TFBertModelTester:
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
model = TFBertForQuestionAnswering(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} inputs = {
start_logits, end_logits = model(inputs) "input_ids": input_ids,
result = { "attention_mask": input_mask,
"start_logits": start_logits.numpy(), "token_type_ids": token_type_ids,
"end_logits": end_logits.numpy(),
} }
start_logits, end_logits = model(inputs)
result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()}
self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
...@@ -285,6 +312,10 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
def test_for_causal_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_lm_head(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
...