Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
@@ -40,8 +40,8 @@ RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class RetriBertPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = RetriBertConfig
@@ -73,7 +73,8 @@ RETRIBERT_START_DOCSTRING = r"""
@add_start_docstrings(
    """Bert Based model to embed queries or document for document retrieval. """,
    RETRIBERT_START_DOCSTRING,
)
class RetriBertModel(RetriBertPreTrainedModel):
    def __init__(self, config):
@@ -91,7 +92,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
        self.init_weights()

    def embed_sentences_checkpointed(
        self,
        input_ids,
        attention_mask,
        sent_encoder,
        checkpoint_batch_size=-1,
    ):
        # reproduces BERT forward pass with checkpointing
        if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
@@ -108,7 +113,11 @@ class RetriBertModel(RetriBertPreTrainedModel):
            # define function for checkpointing
            def partial_encode(*inputs):
                encoder_outputs = sent_encoder.encoder(
                    inputs[0],
                    attention_mask=inputs[1],
                    head_mask=head_mask,
                )
                sequence_output = encoder_outputs[0]
                pooled_output = sent_encoder.pooler(sequence_output)
                return pooled_output
@@ -127,13 +136,24 @@ class RetriBertModel(RetriBertPreTrainedModel):
            return torch.cat(pooled_output_list, dim=0)
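The method above slices the batch and runs the BERT encoder under gradient checkpointing. A minimal, self-contained sketch of that pattern (illustrative only; `encoder_fn` is a hypothetical stand-in for the encoder forward pass, not a function from this file):

import torch
from torch.utils.checkpoint import checkpoint

def embed_in_slices(encoder_fn, input_ids, attention_mask, checkpoint_batch_size=8):
    # Run the encoder on slices of the batch under gradient checkpointing:
    # activations are recomputed during backward instead of being stored.
    pooled = []
    for b in range(0, input_ids.shape[0], checkpoint_batch_size):
        ids = input_ids[b : b + checkpoint_batch_size]
        mask = attention_mask[b : b + checkpoint_batch_size]
        pooled.append(checkpoint(encoder_fn, ids, mask))
    return torch.cat(pooled, dim=0)  # (batch_size, hidden_size)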
    def embed_questions(
        self,
        input_ids,
        attention_mask=None,
        checkpoint_batch_size=-1,
    ):
        q_reps = self.embed_sentences_checkpointed(
            input_ids,
            attention_mask,
            self.bert_query,
            checkpoint_batch_size,
        )
        return self.project_query(q_reps)
    def embed_answers(
        self,
        input_ids,
        attention_mask=None,
        checkpoint_batch_size=-1,
    ):
        a_reps = self.embed_sentences_checkpointed(
            input_ids,
@@ -147,33 +167,33 @@ class RetriBertModel(RetriBertPreTrainedModel):
        self, input_ids_query, attention_mask_query, input_ids_doc, attention_mask_doc, checkpoint_batch_size=-1
    ):
        r"""
        Args:
            input_ids_query (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary for the queries in a batch.
                Indices can be obtained using :class:`transformers.RetriBertTokenizer`.
                See :func:`transformers.PreTrainedTokenizer.encode` and
                :func:`transformers.PreTrainedTokenizer.__call__` for details.
                `What are input IDs? <../glossary.html#input-ids>`__
            attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Mask to avoid performing attention on queries padding token indices.
                Mask values selected in ``[0, 1]``:
                ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
                `What are attention masks? <../glossary.html#attention-mask>`__
            input_ids_doc (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary for the documents in a batch.
            attention_mask_doc (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
                Mask to avoid performing attention on documents padding token indices.
            checkpoint_batch_size (:obj:`int`, `optional`, defaults to :obj:`-1`):
                If greater than 0, uses gradient checkpointing to only compute sequence representation on checkpoint_batch_size examples at a time
                on the GPU. All query representations are still compared to all document representations in the batch.

        Return:
            :obj:`torch.FloatTensor` the bi-directional cross-entropy loss obtained while trying to match each query to its corresponding document
            and each document to its corresponding query in the batch
        """
        device = input_ids_query.device
        q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
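A hedged sketch of how a bi-directional matching loss of the kind described in the Return section is typically computed from the query and document embeddings (names are illustrative, not the exact body of this forward method):

import torch
import torch.nn.functional as F

def bidirectional_matching_loss(q_reps, a_reps):
    # Assumes the i-th query matches the i-th document in the batch.
    scores = torch.mm(q_reps, a_reps.t())            # (n_queries, n_docs) similarity matrix
    targets = torch.arange(scores.shape[0], device=scores.device)
    loss_qd = F.cross_entropy(scores, targets)       # match each query to its document
    loss_dq = F.cross_entropy(scores.t(), targets)   # match each document to its query
    return (loss_qd + loss_dq) / 2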
......
@@ -83,7 +83,7 @@ class RobertaEmbeddings(BertEmbeddings):
        )

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """We are provided embeddings directly. We cannot infer which are padded so just generate
        sequential position ids.

        :param torch.Tensor inputs_embeds:
@@ -220,36 +220,36 @@ class RobertaForCausalLM(BertPreTrainedModel):
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
            is used in the cross-attention if the model is configured as a decoder.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the left-to-right language modeling loss (next word prediction).
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``

        Returns:

        Example::

            >>> from transformers import RobertaTokenizer, RobertaLMHeadModel, RobertaConfig
            >>> import torch

            >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
            >>> config = RobertaConfig.from_pretrained("roberta-base")
            >>> config.is_decoder = True
            >>> model = RobertaLMHeadModel.from_pretrained('roberta-base', config=config, return_dict=True)

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> prediction_logits = outputs.logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -283,7 +283,10 @@ class RobertaForCausalLM(BertPreTrainedModel):
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutput(
            loss=lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
@@ -493,7 +496,10 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -581,7 +587,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -667,7 +676,10 @@ class RobertaForTokenClassification(BertPreTrainedModel):
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -791,7 +803,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
def create_position_ids_from_input_ids(input_ids, padding_idx):
    """Replace non-padding symbols with their position numbers. Position numbers begin at
    padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
    `utils.make_positions`.
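A small illustration of the behaviour described above (a sketch, not necessarily the exact body of this function): positions for non-padding tokens come from a cumulative sum over the padding mask, offset past `padding_idx`.

import torch

def position_ids_from_input_ids_sketch(input_ids, padding_idx):
    # 1 for real tokens, 0 for padding.
    mask = input_ids.ne(padding_idx).int()
    # Running count of real tokens gives 1..n, shifted past padding_idx;
    # the final multiply/add forces padding positions back to padding_idx.
    return torch.cumsum(mask, dim=1).type_as(mask) * mask + padding_idx

# e.g. with padding_idx=1: [[5, 6, 1, 1]] -> [[2, 3, 1, 1]]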
......
@@ -62,8 +62,7 @@ T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
####################################################
def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re
@@ -156,8 +155,8 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """Construct a layernorm module in the T5 style
        No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
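For context, a forward pass for a layer norm of this style rescales by the inverse root mean square only; a minimal sketch, assuming a learned `weight` of shape `(hidden_size,)`:

import torch

def t5_layer_norm_forward(hidden_states, weight, eps=1e-6):
    # T5-style: no mean subtraction and no bias; scale by the inverse RMS, then by the learned weight.
    variance = hidden_states.pow(2).mean(-1, keepdim=True)
    return weight * (hidden_states * torch.rsqrt(variance + eps))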
@@ -569,8 +568,8 @@ class T5Block(nn.Module):
class T5PreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = T5Config
@@ -913,9 +912,9 @@ class T5Model(T5PreTrainedModel):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
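A brief usage sketch of the head-pruning interface documented above (model name and head indices are arbitrary examples):

from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
# Remove attention heads 0 and 2 in layer 0, and head 1 in layer 3.
model.prune_heads({0: [0, 2], 3: [1]})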
@@ -940,19 +939,19 @@ class T5Model(T5PreTrainedModel):
        **kwargs,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import T5Tokenizer, T5Model

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5Model.from_pretrained('t5-small')

            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model(input_ids=input_ids)

            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
        """
        if "decoder_past_key_value_states" in kwargs:
            warnings.warn(
@@ -1093,31 +1092,31 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.

        Returns:

        Examples::

            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)

            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model(input_ids=input_ids, labels=input_ids)
            >>> loss = outputs.loss
            >>> logits = outputs.logits

            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
            >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
            >>> outputs = model.generate(input_ids)
        """
        if "lm_labels" in kwargs:
......
@@ -74,8 +74,7 @@ TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFAlbertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
@@ -169,10 +168,10 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, embedding_size]

        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
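To make the shape contract above concrete, here is a hedged sketch of a weight-tied projection of this kind, assuming `word_embeddings` is the `[vocab_size, embedding_size]` embedding matrix and `embedding_size` is known:

import tensorflow as tf

def linear_logits_sketch(inputs, word_embeddings, embedding_size):
    # inputs: [batch_size, length, embedding_size]; word_embeddings: [vocab_size, embedding_size]
    batch_size = tf.shape(inputs)[0]
    length = tf.shape(inputs)[1]
    x = tf.reshape(inputs, [-1, embedding_size])
    logits = tf.matmul(x, word_embeddings, transpose_b=True)  # project onto the tied vocabulary matrix
    return tf.reshape(logits, [batch_size, length, -1])       # [batch_size, length, vocab_size]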
@@ -478,8 +477,8 @@ class TFAlbertTransformer(tf.keras.layers.Layer):
class TFAlbertPreTrainedModel(TFPreTrainedModel):
    """An abstract class to handle weights initialization and
    a simple interface for downloading and loading pretrained models.
    """

    config_class = AlbertConfig
@@ -551,9 +550,9 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        raise NotImplementedError
@@ -655,7 +654,10 @@ class TFAlbertMainLayer(tf.keras.layers.Layer):
        pooled_output = self.pooler(sequence_output[:, 0])

        if not return_dict:
            return (
                sequence_output,
                pooled_output,
            ) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
@@ -821,16 +823,16 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
    @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def call(self, inputs, **kwargs):
        r"""
        Return:

        Examples::

            import tensorflow as tf
            from transformers import AlbertTokenizer, TFAlbertForPreTraining

            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')

            input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
            outputs = model(input_ids)

            prediction_scores, sop_scores = outputs[:2]
        """
        return_dict = kwargs.get("return_dict")
        return_dict = return_dict if return_dict is not None else self.albert.return_dict
@@ -856,7 +858,9 @@ class TFAlbertSOPHead(tf.keras.layers.Layer):
        self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            name="classifier",
        )

    def call(self, pooled_output, training: bool):
@@ -935,7 +939,10 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss):
            return ((loss,) + output) if loss is not None else output

        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1016,7 +1023,10 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
            return ((loss,) + output) if loss is not None else output

        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1095,7 +1105,10 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
            return ((loss,) + output) if loss is not None else output

        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -1211,7 +1224,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
    @property
    def dummy_inputs(self):
        """Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
@@ -1316,5 +1329,8 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
            return ((loss,) + output) if loss is not None else output

        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@@ -310,27 +310,27 @@ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
class TFAutoModel(object):
    r"""
    :class:`~transformers.TFAutoModel` is a generic model class
    that will be instantiated as one of the base model classes of the library
    when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `t5`: TFT5Model (T5 model)
        - `distilbert`: TFDistilBertModel (DistilBERT model)
        - `roberta`: TFRobertaModel (RoBERTa model)
        - `bert`: TFBertModel (Bert model)
        - `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
        - `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
        - `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
        - `xlnet`: TFXLNetModel (XLNet model)
        - `xlm`: TFXLMModel (XLM model)
        - `ctrl`: TFCTRLModel (CTRL model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
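A brief usage sketch of the two class methods described above (the checkpoint name is an arbitrary example):

from transformers import AutoConfig, TFAutoModel

# from_pretrained picks the concrete class (here TFBertModel) from the checkpoint's config.
model = TFAutoModel.from_pretrained("bert-base-uncased")

# from_config builds the architecture only, without loading pretrained weights.
config = AutoConfig.from_pretrained("bert-base-uncased")
model = TFAutoModel.from_config(config)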
@@ -342,7 +342,7 @@ class TFAutoModel(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -381,7 +381,7 @@ class TFAutoModel(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the base model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -477,11 +477,11 @@ class TFAutoModel(object):
class TFAutoModelForPreTraining(object):
    r"""
    :class:`~transformers.TFAutoModelForPreTraining` is a generic model class
    that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `TFAutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -493,7 +493,7 @@ class TFAutoModelForPreTraining(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -532,7 +532,7 @@ class TFAutoModelForPreTraining(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the model classes of the library -with the architecture used for pretraining this model– from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
        based on the `model_type` property of the config object, or when it's missing,
@@ -630,27 +630,27 @@ class TFAutoModelForPreTraining(object):
class TFAutoModelWithLMHead(object):
    r"""
    :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `t5`: TFT5ForConditionalGeneration (T5 model)
        - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
        - `roberta`: TFRobertaForMaskedLM (RoBERTa model)
        - `bert`: TFBertForMaskedLM (Bert model)
        - `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
        - `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
        - `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
        - `xlnet`: TFXLNetLMHeadModel (XLNet model)
        - `xlm`: TFXLMWithLMHeadModel (XLM model)
        - `ctrl`: TFCTRLLMHeadModel (CTRL model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -662,7 +662,7 @@ class TFAutoModelWithLMHead(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -705,7 +705,7 @@ class TFAutoModelWithLMHead(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -808,18 +808,18 @@ class TFAutoModelWithLMHead(object):
class TFAutoModelForMultipleChoice:
    r"""
    :class:`~transformers.TFAutoModelForMultipleChoice` is a generic model class
    that will be instantiated as one of the multiple choice model classes of the library
    when created with the `TFAutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `albert`: TFAlbertForMultipleChoice (Albert model)
        - `bert`: TFBertForMultipleChoice (Bert model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -831,7 +831,7 @@ class TFAutoModelForMultipleChoice:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -864,7 +864,7 @@ class TFAutoModelForMultipleChoice:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the multiple choice model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -958,12 +958,12 @@ class TFAutoModelForMultipleChoice:
class TFAutoModelForCausalLM:
    r"""
    :class:`~transformers.TFAutoModelForCausalLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -975,7 +975,7 @@ class TFAutoModelForCausalLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1011,7 +1011,7 @@ class TFAutoModelForCausalLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1093,12 +1093,12 @@ class TFAutoModelForCausalLM:
class TFAutoModelForMaskedLM:
    r"""
    :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1110,7 +1110,7 @@ class TFAutoModelForMaskedLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1149,7 +1149,7 @@ class TFAutoModelForMaskedLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1235,12 +1235,12 @@ class TFAutoModelForMaskedLM:
class TFAutoModelForSeq2SeqLM:
    r"""
    :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class
    that will be instantiated as one of the language modeling model classes of the library
    when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)`
    class method.

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1252,7 +1252,7 @@ class TFAutoModelForSeq2SeqLM:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1285,7 +1285,7 @@ class TFAutoModelForSeq2SeqLM:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1364,22 +1364,22 @@ class TFAutoModelForSeq2SeqLM:
class TFAutoModelForSequenceClassification(object):
    r"""
    :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
    that will be instantiated as one of the sequence classification model classes of the library
    when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
        - `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
        - `bert`: TFBertForSequenceClassification (Bert model)
        - `xlnet`: TFXLNetForSequenceClassification (XLNet model)
        - `xlm`: TFXLMForSequenceClassification (XLM model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1391,7 +1391,7 @@ class TFAutoModelForSequenceClassification(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1428,7 +1428,7 @@ class TFAutoModelForSequenceClassification(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1525,23 +1525,23 @@ class TFAutoModelForSequenceClassification(object):
class TFAutoModelForQuestionAnswering(object):
    r"""
    :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
    that will be instantiated as one of the question answering model classes of the library
    when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
    class method.

    The `from_pretrained()` method takes care of returning the correct model class instance
    based on the `model_type` property of the config object, or when it's missing,
    falling back to using pattern matching on the `pretrained_model_name_or_path` string:

        - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
        - `albert`: TFAlbertForQuestionAnswering (ALBERT model)
        - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model)
        - `bert`: TFBertForQuestionAnswering (Bert model)
        - `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
        - `xlm`: TFXLMForQuestionAnswering (XLM model)

    This class cannot be instantiated using `__init__()` (throws an error).
    """

    def __init__(self):
@@ -1553,7 +1553,7 @@ class TFAutoModelForQuestionAnswering(object):
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1591,7 +1591,7 @@ class TFAutoModelForQuestionAnswering(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
@@ -1697,7 +1697,7 @@ class TFAutoModelForTokenClassification:
    @classmethod
    def from_config(cls, config):
        r"""Instantiates one of the base model classes of the library
        from a configuration.

        Note:
@@ -1733,7 +1733,7 @@ class TFAutoModelForTokenClassification:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.

        The `from_pretrained()` method takes care of returning the correct model class instance
......
@@ -89,7 +89,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
def gelu(x):
    """Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
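For reference, sketches of the exact (erf-based) GELU and of the tanh approximation quoted above, written with TensorFlow ops (illustrative, not necessarily the exact bodies used in this file):

import math
import tensorflow as tf

def gelu_sketch(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF.
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf

def gelu_new_sketch(x):
    # Tanh approximation mentioned in the docstring above (OpenAI GPT variant).
    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))
    return x * cdf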
@@ -127,8 +127,7 @@ ACT2FN = {
class TFBertEmbeddings(tf.keras.layers.Layer):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
@@ -225,10 +224,10 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
            inputs: A float32 tensor with shape [batch_size, length, hidden_size]

        Returns:
            float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = shape_list(inputs)[0]
        length = shape_list(inputs)[1]
@@ -551,9 +550,9 @@ class TFBertMainLayer(tf.keras.layers.Layer):
        self.embeddings.vocab_size = value.shape[0]

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        raise NotImplementedError
...@@ -656,7 +655,10 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -656,7 +655,10 @@ class TFBertMainLayer(tf.keras.layers.Layer):
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -667,8 +669,8 @@ class TFBertMainLayer(tf.keras.layers.Layer): ...@@ -667,8 +669,8 @@ class TFBertMainLayer(tf.keras.layers.Layer):
class TFBertPreTrainedModel(TFPreTrainedModel): class TFBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = BertConfig config_class = BertConfig
...@@ -824,18 +826,18 @@ class TFBertForPreTraining(TFBertPreTrainedModel): ...@@ -824,18 +826,18 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
@replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import BertTokenizer, TFBertForPreTraining from transformers import BertTokenizer, TFBertForPreTraining
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForPreTraining.from_pretrained('bert-base-uncased') model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2] prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
...@@ -933,7 +935,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): ...@@ -933,7 +935,10 @@ class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1011,12 +1016,16 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -1011,12 +1016,16 @@ class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFCausalLMOutput( return TFCausalLMOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, """Bert Model with a `next sentence prediction (classification)` head on top. """,
BERT_START_DOCSTRING,
) )
class TFBertForNextSentencePrediction(TFBertPreTrainedModel): class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -1029,22 +1038,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -1029,22 +1038,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import BertTokenizer, TFBertForNextSentencePrediction from transformers import BertTokenizer, TFBertForNextSentencePrediction
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light." next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='tf') encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random assert logits[0][0] < logits[0][1] # the next sentence was random
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.bert.return_dict return_dict = return_dict if return_dict is not None else self.bert.return_dict
...@@ -1057,7 +1066,9 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): ...@@ -1057,7 +1066,9 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
return (seq_relationship_score,) + outputs[2:] return (seq_relationship_score,) + outputs[2:]
return TFNextSentencePredictorOutput( return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1138,7 +1149,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -1138,7 +1149,10 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1159,7 +1173,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1159,7 +1173,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -1261,7 +1275,10 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1261,7 +1275,10 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1340,7 +1357,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1340,7 +1357,10 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -77,7 +77,8 @@ class TFCamembertModel(TFRobertaModel): ...@@ -77,7 +77,8 @@ class TFCamembertModel(TFRobertaModel):
@add_start_docstrings( @add_start_docstrings(
"""CamemBERT Model with a `language modeling` head on top. """, CAMEMBERT_START_DOCSTRING, """CamemBERT Model with a `language modeling` head on top. """,
CAMEMBERT_START_DOCSTRING,
) )
class TFCamembertForMaskedLM(TFRobertaForMaskedLM): class TFCamembertForMaskedLM(TFRobertaForMaskedLM):
""" """
......
...@@ -245,8 +245,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -245,8 +245,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -426,8 +426,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): ...@@ -426,8 +426,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer):
class TFCTRLPreTrainedModel(TFPreTrainedModel): class TFCTRLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = CTRLConfig config_class = CTRLConfig
......
...@@ -70,7 +70,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -70,7 +70,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
def gelu(x): def gelu(x):
""" Gaussian Error Linear Unit. """Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created. Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
...@@ -177,10 +177,10 @@ class TFEmbeddings(tf.keras.layers.Layer): ...@@ -177,10 +177,10 @@ class TFEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -518,8 +518,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): ...@@ -518,8 +518,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer):
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # # INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class TFDistilBertPreTrainedModel(TFPreTrainedModel): class TFDistilBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = DistilBertConfig config_class = DistilBertConfig
...@@ -634,7 +634,8 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): ...@@ -634,7 +634,8 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
@add_start_docstrings( @add_start_docstrings(
"""DistilBert Model with a `masked language modeling` head on top. """, DISTILBERT_START_DOCSTRING, """DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING,
) )
class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
...@@ -875,7 +876,10 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -875,7 +876,10 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -902,7 +906,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -902,7 +906,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
......
...@@ -54,8 +54,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -54,8 +54,7 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFElectraEmbeddings(tf.keras.layers.Layer): class TFElectraEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings."""
"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -94,7 +93,13 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -94,7 +93,13 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
super().build(input_shape) super().build(input_shape)
def call( def call(
self, input_ids, position_ids=None, token_type_ids=None, inputs_embeds=None, mode="embedding", training=False, self,
input_ids,
position_ids=None,
token_type_ids=None,
inputs_embeds=None,
mode="embedding",
training=False,
): ):
"""Get token embeddings of inputs. """Get token embeddings of inputs.
Args: Args:
...@@ -144,10 +149,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ...@@ -144,10 +149,10 @@ class TFElectraEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -250,9 +255,9 @@ class TFElectraMainLayer(TFElectraPreTrainedModel): ...@@ -250,9 +255,9 @@ class TFElectraMainLayer(TFElectraPreTrainedModel):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -491,18 +496,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): ...@@ -491,18 +496,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
Examples:: Examples::
import tensorflow as tf import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForPreTraining from transformers import ElectraTokenizer, TFElectraForPreTraining
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) outputs = model(input_ids)
scores = outputs[0] scores = outputs[0]
""" """
return_dict = return_dict if return_dict is not None else self.electra.config.return_dict return_dict = return_dict if return_dict is not None else self.electra.config.return_dict
...@@ -729,7 +734,10 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla ...@@ -729,7 +734,10 @@ class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -752,7 +760,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) ...@@ -752,7 +760,7 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -853,7 +861,10 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss) ...@@ -853,7 +861,10 @@ class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss)
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1020,7 +1031,10 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -1020,7 +1031,10 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
loss = self.compute_loss(labels, (start_logits, end_logits)) loss = self.compute_loss(labels, (start_logits, end_logits))
if not return_dict: if not return_dict:
output = (start_logits, end_logits,) + discriminator_hidden_states[1:] output = (
start_logits,
end_logits,
) + discriminator_hidden_states[1:]
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFQuestionAnsweringModelOutput( return TFQuestionAnsweringModelOutput(
......
...@@ -252,8 +252,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -252,8 +252,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
self.wte.vocab_size = self.wte.weight.shape[0] self.wte.vocab_size = self.wte.weight.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -417,8 +417,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): ...@@ -417,8 +417,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
class TFGPT2PreTrainedModel(TFPreTrainedModel): class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = GPT2Config config_class = GPT2Config
...@@ -698,34 +698,34 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -698,34 +698,34 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
training=False, training=False,
): ):
r""" r"""
mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input) mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> # Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices] >>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
>>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids) >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -55,9 +55,9 @@ TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -55,9 +55,9 @@ TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_sep_token=True): def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_sep_token=True):
""" """
Computes global attention mask by putting attention on all tokens Computes global attention mask by putting attention on all tokens
before `sep_token_id` if `before_sep_token is True` else after before `sep_token_id` if `before_sep_token is True` else after
`sep_token_id`. `sep_token_id`.
""" """
assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions" assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
...@@ -72,11 +72,14 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se ...@@ -72,11 +72,14 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se
) )
else: else:
# the last token is a separation token and should not be counted, and in the middle are two separation tokens # the last token is a separation token and should not be counted, and in the middle are two separation tokens
attention_mask = tf.cast( attention_mask = (
tf.broadcast_to(attention_mask, input_ids_shape) tf.cast(
> tf.broadcast_to(question_end_index + 1, input_ids_shape), tf.broadcast_to(attention_mask, input_ids_shape)
tf.dtypes.int32, > tf.broadcast_to(question_end_index + 1, input_ids_shape),
) * tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32) tf.dtypes.int32,
)
* tf.cast(tf.broadcast_to(attention_mask, input_ids_shape) < input_ids_shape[-1], tf.dtypes.int32)
)
return attention_mask return attention_mask
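A minimal sketch of the `before_sep_token=True` case described in the docstring above: put (global) attention on every token up to the first separator and nothing after it. This is a simplified stand-in that works from `input_ids` directly rather than from `sep_token_indices`, and it assumes a separator is present in every row:

import tensorflow as tf

def global_mask_before_first_sep(input_ids, sep_token_id):
    # 1 for every position up to and including the first separator token, 0 afterwards
    is_sep = tf.cast(tf.equal(input_ids, sep_token_id), tf.int32)
    first_sep = tf.argmax(is_sep, axis=-1, output_type=tf.int32)   # position of the first separator per row
    positions = tf.range(tf.shape(input_ids)[-1])[None, :]
    return tf.cast(positions <= first_sep[:, None], tf.int32)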
...@@ -130,7 +133,9 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -130,7 +133,9 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
self.one_sided_attn_window_size = attention_window // 2 self.one_sided_attn_window_size = attention_window // 2
def call( def call(
self, inputs, training=False, self,
inputs,
training=False,
): ):
""" """
LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`. LongformerSelfAttention expects `len(hidden_states)` to be a multiple of `attention_window`.
...@@ -433,7 +438,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -433,7 +438,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap): def _sliding_chunks_matmul_attn_probs_value(self, attn_probs, value, window_overlap):
"""Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. """Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors.
Returned tensor will be of the same shape as `attn_probs`""" Returned tensor will be of the same shape as `attn_probs`"""
batch_size, seq_len, num_heads, head_dim = shape_list(value) batch_size, seq_len, num_heads, head_dim = shape_list(value)
...@@ -508,17 +513,17 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -508,17 +513,17 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
@staticmethod @staticmethod
def _pad_and_diagonalize(chunked_hidden_states): def _pad_and_diagonalize(chunked_hidden_states):
"""shift every row 1 step right, converting columns into diagonals. """shift every row 1 step right, converting columns into diagonals.
Example: Example:
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492, chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
-1.8348, 0.7672, 0.2986, 0.0285, -1.8348, 0.7672, 0.2986, 0.0285,
-0.7584, 0.4206, -0.0405, 0.1599, -0.7584, 0.4206, -0.0405, 0.1599,
2.0514, -1.1600, 0.5372, 0.2629 ] 2.0514, -1.1600, 0.5372, 0.2629 ]
window_overlap = num_rows = 4 window_overlap = num_rows = 4
(pad & diagonilize) => (pad & diagonilize) =>
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000 [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
""" """
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states) total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
...@@ -779,7 +784,8 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ...@@ -779,7 +784,8 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
tf.transpose(global_attn_output, (0, 2, 1, 3)), is_local_index_global_attn_nonzero tf.transpose(global_attn_output, (0, 2, 1, 3)), is_local_index_global_attn_nonzero
) )
nonzero_global_attn_output = tf.reshape( nonzero_global_attn_output = tf.reshape(
nonzero_global_attn_output, (shape_list(is_local_index_global_attn_nonzero)[0], -1), nonzero_global_attn_output,
(shape_list(is_local_index_global_attn_nonzero)[0], -1),
) )
# overwrite values with global attention # overwrite values with global attention
...@@ -910,9 +916,9 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -910,9 +916,9 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
self.embeddings.vocab_size = value.shape[0] self.embeddings.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -1021,7 +1027,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1021,7 +1027,10 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
sequence_output = sequence_output[:, :-padding_len] sequence_output = sequence_output[:, :-padding_len]
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -1031,7 +1040,13 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1031,7 +1040,13 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
) )
def _pad_to_window_size( def _pad_to_window_size(
self, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, pad_token_id, self,
input_ids,
attention_mask,
token_type_ids,
position_ids,
inputs_embeds,
pad_token_id,
): ):
"""A helper function to pad tokens and mask to work with implementation of Longformer selfattention.""" """A helper function to pad tokens and mask to work with implementation of Longformer selfattention."""
# padding # padding
...@@ -1083,8 +1098,8 @@ class TFLongformerMainLayer(tf.keras.layers.Layer): ...@@ -1083,8 +1098,8 @@ class TFLongformerMainLayer(tf.keras.layers.Layer):
class TFLongformerPreTrainedModel(TFPreTrainedModel): class TFLongformerPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = LongformerConfig config_class = LongformerConfig
...@@ -1286,7 +1301,10 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel ...@@ -1286,7 +1301,10 @@ class TFLongformerForMaskedLM(TFLongformerPreTrainedModel, TFMaskedLanguageModel
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -99,8 +99,7 @@ NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm} ...@@ -99,8 +99,7 @@ NORM2FN = {"layer_norm": TFLayerNorm, "no_norm": TFNoNorm}
class TFMobileBertEmbeddings(tf.keras.layers.Layer): class TFMobileBertEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings."""
"""
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -223,10 +222,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): ...@@ -223,10 +222,10 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer):
def _linear(self, inputs): def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer. """Computes logits by running inputs through a linear layer.
Args: Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size] inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns: Returns:
float32 tensor with shape [batch_size, length, vocab_size]. float32 tensor with shape [batch_size, length, vocab_size].
""" """
batch_size = shape_list(inputs)[0] batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1] length = shape_list(inputs)[1]
...@@ -696,9 +695,9 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -696,9 +695,9 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
raise NotImplementedError raise NotImplementedError
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
raise NotImplementedError raise NotImplementedError
...@@ -799,7 +798,10 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -799,7 +798,10 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
pooled_output = self.pooler(sequence_output) pooled_output = self.pooler(sequence_output)
if not return_dict: if not return_dict:
return (sequence_output, pooled_output,) + encoder_outputs[1:] return (
sequence_output,
pooled_output,
) + encoder_outputs[1:]
return TFBaseModelOutputWithPooling( return TFBaseModelOutputWithPooling(
last_hidden_state=sequence_output, last_hidden_state=sequence_output,
...@@ -810,8 +812,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer): ...@@ -810,8 +812,8 @@ class TFMobileBertMainLayer(tf.keras.layers.Layer):
class TFMobileBertPreTrainedModel(TFPreTrainedModel): class TFMobileBertPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = MobileBertConfig config_class = MobileBertConfig
...@@ -967,18 +969,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): ...@@ -967,18 +969,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
@replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFMobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased') >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids) >>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
...@@ -1069,7 +1071,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel ...@@ -1069,7 +1071,10 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModel
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1098,21 +1103,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1098,21 +1103,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
@replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased') >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf') >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
""" """
return_dict = kwargs.get("return_dict") return_dict = kwargs.get("return_dict")
return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict return_dict = return_dict if return_dict is not None else self.mobilebert.return_dict
...@@ -1125,7 +1130,9 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1125,7 +1130,9 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
return (seq_relationship_score,) + outputs[2:] return (seq_relationship_score,) + outputs[2:]
return TFNextSentencePredictorOutput( return TFNextSentencePredictorOutput(
logits=seq_relationship_score, hidden_states=outputs.hidden_states, attentions=outputs.attentions, logits=seq_relationship_score,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1206,7 +1213,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ...@@ -1206,7 +1213,10 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1323,7 +1333,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1323,7 +1333,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -1425,7 +1435,10 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1425,7 +1435,10 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -1504,5 +1517,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ...@@ -1504,5 +1517,8 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -243,8 +243,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -243,8 +243,8 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
self.tokens_embed.vocab_size = value.shape[0] self.tokens_embed.vocab_size = value.shape[0]
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
""" """
raise NotImplementedError raise NotImplementedError
...@@ -373,13 +373,15 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): ...@@ -373,13 +373,15 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)
return TFBaseModelOutput( return TFBaseModelOutput(
last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions, last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_attentions,
) )
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = OpenAIGPTConfig config_class = OpenAIGPTConfig
...@@ -630,31 +632,31 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): ...@@ -630,31 +632,31 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
training=False, training=False,
): ):
r""" r"""
mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input) mc_token_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, num_choices)`, `optional`, defaults to the index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``. Selected in the range ``[0, input_ids.size(-1) - 1]``.
Return: Return:
Examples:: Examples::
>>> import tensorflow as tf >>> import tensorflow as tf
>>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
>>> # Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size >>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf") >>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()} >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1 >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -28,15 +28,15 @@ logger = logging.get_logger(__name__) ...@@ -28,15 +28,15 @@ logger = logging.get_logger(__name__)
def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""):
""" Convert a TF 2.0 model variable name in a pytorch model weight name. """Convert a TF 2.0 model variable name in a pytorch model weight name.
Conventions for TF2.0 scopes -> PyTorch attribute names conversions: Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
- '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
- '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
return tuple with: return tuple with:
- pytorch model weight name - pytorch model weight name
- transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regard to each other - transpose: boolean indicating whether TF2.0 and PyTorch weights matrices are transposed with regard to each other
""" """
tf_name = tf_name.replace(":0", "") # device ids tf_name = tf_name.replace(":0", "") # device ids
tf_name = re.sub( tf_name = re.sub(
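A hypothetical mini-version covering only the two scope conventions listed in the docstring above; the regex details are assumptions, and the real function additionally maps kernel/gamma/beta variables to their PyTorch names and decides whether a transpose is needed:

import re

def tf_scope_to_pt_name(tf_name):
    tf_name = tf_name.replace(":0", "")                        # drop the device id suffix
    tf_name = re.sub(r"/[^/]*___([^/]*)/", r"/\1/", tf_name)   # '$1___$2' is replaced by $2
    tf_name = tf_name.replace("_._", "/")                      # '_._' becomes a new level separation
    return ".".join(tf_name.split("/"))                        # scopes turn into attribute access

print(tf_scope_to_pt_name("bert/encoder/layer_._0/attention/self/query/kernel:0"))
# bert.encoder.layer.0.attention.self.query.kernel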
...@@ -72,8 +72,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="") ...@@ -72,8 +72,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="")
def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model """Load pytorch checkpoints in a TF 2.0 model"""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
...@@ -96,8 +95,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i ...@@ -96,8 +95,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model """Load pytorch checkpoints in a TF 2.0 model"""
"""
pt_state_dict = pt_model.state_dict() pt_state_dict = pt_model.state_dict()
return load_pytorch_weights_in_tf2_model( return load_pytorch_weights_in_tf2_model(
...@@ -106,8 +104,7 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi ...@@ -106,8 +104,7 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi
def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch state_dict in a TF 2.0 model. """Load pytorch state_dict in a TF 2.0 model."""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
...@@ -230,9 +227,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ...@@ -230,9 +227,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load TF 2.0 HDF5 checkpoint in a PyTorch model """Load TF 2.0 HDF5 checkpoint in a PyTorch model
We use HDF5 to easily do transfer learning We use HDF5 to easily do transfer learning
(see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
""" """
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
...@@ -265,16 +262,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs ...@@ -265,16 +262,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
""" Load TF 2.0 model in a pytorch model """Load TF 2.0 model in a pytorch model"""
"""
weights = tf_model.weights weights = tf_model.weights
return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys) return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys)
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False): def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
""" Load TF2.0 symbolic weights in a PyTorch model """Load TF2.0 symbolic weights in a PyTorch model"""
"""
try: try:
import tensorflow as tf # noqa: F401 import tensorflow as tf # noqa: F401
import torch # noqa: F401 import torch # noqa: F401
......
...@@ -73,7 +73,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): ...@@ -73,7 +73,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
self.padding_idx = 1 self.padding_idx = 1
def create_position_ids_from_input_ids(self, x): def create_position_ids_from_input_ids(self, x):
""" Replace non-padding symbols with their position numbers. Position numbers begin at """Replace non-padding symbols with their position numbers. Position numbers begin at
padding_idx+1. Padding symbols are ignored. This is modified from fairseq's padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
`utils.make_positions`. `utils.make_positions`.
:param tf.Tensor x: :param tf.Tensor x:
...@@ -84,7 +84,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): ...@@ -84,7 +84,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
return incremental_indicies + self.padding_idx return incremental_indicies + self.padding_idx
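A worked sketch of the rule above, assuming `padding_idx = 1`: padding positions keep `padding_idx`, and real tokens are numbered from `padding_idx + 1` onwards.

import tensorflow as tf

def position_ids_from_input_ids(x, padding_idx=1):
    # mask out padding, count real tokens cumulatively, then shift by padding_idx
    mask = tf.cast(tf.math.not_equal(x, padding_idx), dtype=tf.int32)
    incremental_indices = tf.math.cumsum(mask, axis=1) * mask
    return incremental_indices + padding_idx

# e.g. input_ids [[5, 6, 1, 1]] (with 1 as padding) -> position ids [[2, 3, 1, 1]]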
def create_position_ids_from_inputs_embeds(self, inputs_embeds): def create_position_ids_from_inputs_embeds(self, inputs_embeds):
""" We are provided embeddings directly. We cannot infer which are padded so just generate """We are provided embeddings directly. We cannot infer which are padded so just generate
sequential position ids. sequential position ids.
:param tf.Tensor inputs_embeds: :param tf.Tensor inputs_embeds:
:return tf.Tensor: :return tf.Tensor:
...@@ -120,8 +120,8 @@ class TFRobertaMainLayer(TFBertMainLayer): ...@@ -120,8 +120,8 @@ class TFRobertaMainLayer(TFBertMainLayer):
class TFRobertaPreTrainedModel(TFPreTrainedModel): class TFRobertaPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = RobertaConfig config_class = RobertaConfig
...@@ -330,7 +330,10 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos ...@@ -330,7 +330,10 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLos
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMaskedLMOutput( return TFMaskedLMOutput(
loss=loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -431,7 +434,10 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -431,7 +434,10 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFSequenceClassifierOutput( return TFSequenceClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -452,7 +458,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -452,7 +458,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
@property @property
def dummy_inputs(self): def dummy_inputs(self):
""" Dummy inputs to build the network. """Dummy inputs to build the network.
Returns: Returns:
tf.Tensor with dummy inputs tf.Tensor with dummy inputs
...@@ -549,7 +555,10 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -549,7 +555,10 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFMultipleChoiceModelOutput( return TFMultipleChoiceModelOutput(
loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
...@@ -628,7 +637,10 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -628,7 +637,10 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
return ((loss,) + output) if loss is not None else output return ((loss,) + output) if loss is not None else output
return TFTokenClassifierOutput( return TFTokenClassifierOutput(
loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
) )
......
...@@ -67,8 +67,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -67,8 +67,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
class TFT5LayerNorm(tf.keras.layers.Layer): class TFT5LayerNorm(tf.keras.layers.Layer):
def __init__(self, epsilon=1e-6, **kwargs): def __init__(self, epsilon=1e-6, **kwargs):
""" Construct a layernorm module in the T5 style """Construct a layernorm module in the T5 style
No bias and no subtraction of mean. No bias and no subtraction of mean.
""" """
super().__init__(**kwargs) super().__init__(**kwargs)
self.variance_epsilon = epsilon self.variance_epsilon = epsilon
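A minimal sketch of the T5-style normalization the docstring describes (scale by the root mean square only, with no mean subtraction and no bias); the variable names here are assumptions, not the layer's internals:

import tensorflow as tf

x = tf.constant([[1.0, 2.0, 3.0]])
weight = tf.ones_like(x)          # stands in for the learned scale parameter
epsilon = 1e-6

# Variance computed from raw squares (no mean subtraction), then scale only.
variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
normed = weight * x * tf.math.rsqrt(variance + epsilon)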
...@@ -140,7 +140,9 @@ class TFT5Attention(tf.keras.layers.Layer): ...@@ -140,7 +140,9 @@ class TFT5Attention(tf.keras.layers.Layer):
if self.has_relative_attention_bias: if self.has_relative_attention_bias:
self.relative_attention_bias = tf.keras.layers.Embedding( self.relative_attention_bias = tf.keras.layers.Embedding(
self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias", self.relative_attention_num_buckets,
self.n_heads,
name="relative_attention_bias",
) )
self.pruned_heads = set() self.pruned_heads = set()
...@@ -199,7 +201,9 @@ class TFT5Attention(tf.keras.layers.Layer): ...@@ -199,7 +201,9 @@ class TFT5Attention(tf.keras.layers.Layer):
memory_position = tf.range(klen)[None, :] memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position # shape (qlen, klen) relative_position = memory_position - context_position # shape (qlen, klen)
rp_bucket = self._relative_position_bucket( rp_bucket = self._relative_position_bucket(
relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, relative_position,
bidirectional=not self.is_decoder,
num_buckets=self.relative_attention_num_buckets,
) )
values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads)
values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
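A tiny illustration (not taken from the diff) of the relative-position matrix that gets bucketed above, for qlen = klen = 3:

import tensorflow as tf

qlen = klen = 3
context_position = tf.range(qlen)[:, None]
memory_position = tf.range(klen)[None, :]
relative_position = memory_position - context_position  # shape (qlen, klen)
print(relative_position.numpy())
# [[ 0  1  2]
#  [-1  0  1]
#  [-2 -1  0]]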
...@@ -316,7 +320,9 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): ...@@ -316,7 +320,9 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs): def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.SelfAttention = TFT5Attention( self.SelfAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention", config,
has_relative_attention_bias=has_relative_attention_bias,
name="SelfAttention",
) )
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...@@ -353,7 +359,9 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): ...@@ -353,7 +359,9 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
def __init__(self, config, has_relative_attention_bias=False, **kwargs): def __init__(self, config, has_relative_attention_bias=False, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.EncDecAttention = TFT5Attention( self.EncDecAttention = TFT5Attention(
config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention", config,
has_relative_attention_bias=has_relative_attention_bias,
name="EncDecAttention",
) )
self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...@@ -396,12 +404,18 @@ class TFT5Block(tf.keras.layers.Layer): ...@@ -396,12 +404,18 @@ class TFT5Block(tf.keras.layers.Layer):
self.is_decoder = config.is_decoder self.is_decoder = config.is_decoder
self.layer = [] self.layer = []
self.layer.append( self.layer.append(
TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",) TFT5LayerSelfAttention(
config,
has_relative_attention_bias=has_relative_attention_bias,
name="layer_._0",
)
) )
if self.is_decoder: if self.is_decoder:
self.layer.append( self.layer.append(
TFT5LayerCrossAttention( TFT5LayerCrossAttention(
config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1", config,
has_relative_attention_bias=has_relative_attention_bias,
name="layer_._1",
) )
) )
...@@ -490,9 +504,9 @@ class TFT5Block(tf.keras.layers.Layer): ...@@ -490,9 +504,9 @@ class TFT5Block(tf.keras.layers.Layer):
class _NoLayerEmbedTokens: class _NoLayerEmbedTokens:
""" """
this class wraps the TFSharedEmbeddingTokens layer into a python 'no-keras-layer' this class wraps the TFSharedEmbeddingTokens layer into a python 'no-keras-layer'
class to avoid problems with weight restoring. It also makes sure that the layer is class to avoid problems with weight restoring. It also makes sure that the layer is
called from the correct scope to avoid problems with saving/storing the correct weights called from the correct scope to avoid problems with saving/storing the correct weights
""" """
def __init__(self, layer, abs_scope_name=None): def __init__(self, layer, abs_scope_name=None):
...@@ -539,7 +553,11 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -539,7 +553,11 @@ class TFT5MainLayer(tf.keras.layers.Layer):
self.num_hidden_layers = config.num_layers self.num_hidden_layers = config.num_layers
self.block = [ self.block = [
TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),) TFT5Block(
config,
has_relative_attention_bias=bool(i == 0),
name="block_._{}".format(i),
)
for i in range(config.num_layers) for i in range(config.num_layers)
] ]
self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm")
...@@ -654,7 +672,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -654,7 +672,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
if self.is_decoder: if self.is_decoder:
seq_ids = tf.range(mask_seq_length) seq_ids = tf.range(mask_seq_length)
causal_mask = tf.less_equal( causal_mask = tf.less_equal(
tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), seq_ids[None, :, None], tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
seq_ids[None, :, None],
) )
causal_mask = tf.cast(causal_mask, dtype=tf.float32) causal_mask = tf.cast(causal_mask, dtype=tf.float32)
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
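What the causal mask construction above evaluates to for a length-3 sequence and batch size 1; a hedged sketch, not library code:

import tensorflow as tf

batch_size, mask_seq_length = 1, 3
seq_ids = tf.range(mask_seq_length)
causal_mask = tf.less_equal(
    tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
    seq_ids[None, :, None],
)
print(tf.cast(causal_mask, tf.float32).numpy()[0])
# [[1. 0. 0.]
#  [1. 1. 0.]
#  [1. 1. 1.]]  -- each query position attends only to itself and earlier positions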
...@@ -765,8 +784,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): ...@@ -765,8 +784,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
# pointers for your model. # pointers for your model.
#################################################### ####################################################
class TFT5PreTrainedModel(TFPreTrainedModel): class TFT5PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = T5Config config_class = T5Config
...@@ -961,17 +980,17 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -961,17 +980,17 @@ class TFT5Model(TFT5PreTrainedModel):
training=False, training=False,
): ):
r""" r"""
Returns: Returns:
Examples:: Examples::
>>> from transformers import T5Tokenizer, TFT5Model >>> from transformers import T5Tokenizer, TFT5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5Model.from_pretrained('t5-small') >>> model = TFT5Model.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -1157,26 +1176,26 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling ...@@ -1157,26 +1176,26 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
training=False, training=False,
): ):
r""" r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the cross entropy classification loss. Labels for computing the cross entropy classification loss.
Indices should be in ``[0, ..., config.vocab_size - 1]``. Indices should be in ``[0, ..., config.vocab_size - 1]``.
Returns: Returns:
Examples:: Examples::
>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
>>> prediction_scores = outputs[0] >>> prediction_scores = outputs[0]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> result = model.generate(inputs) >>> result = model.generate(inputs)
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
...@@ -628,7 +628,13 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -628,7 +628,13 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
hids.append(core_out) hids.append(core_out)
mems_i = None if mems is None else mems[i] mems_i = None if mems is None else mems[i]
layer_outputs = layer( layer_outputs = layer(
core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i], output_attentions, training=training, core_out,
pos_emb,
dec_attn_mask,
mems_i,
head_mask[i],
output_attentions,
training=training,
) )
core_out = layer_outputs[0] core_out = layer_outputs[0]
if output_attentions: if output_attentions:
...@@ -657,13 +663,16 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): ...@@ -657,13 +663,16 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
return TFTransfoXLModelOutput( return TFTransfoXLModelOutput(
last_hidden_state=core_out, mems=new_mems, hidden_states=hids, attentions=attentions, last_hidden_state=core_out,
mems=new_mems,
hidden_states=hids,
attentions=attentions,
) )
class TFTransfoXLPreTrainedModel(TFPreTrainedModel): class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = TransfoXLConfig config_class = TransfoXLConfig
...@@ -852,8 +861,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -852,8 +861,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
) )
def get_output_embeddings(self): def get_output_embeddings(self):
""" Double-check if you are using adaptive softmax. """Double-check if you are using adaptive softmax."""
"""
if len(self.crit.out_layers) > 0: if len(self.crit.out_layers) > 0:
return self.crit.out_layers[-1] return self.crit.out_layers[-1]
return None return None
......
...@@ -64,7 +64,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -64,7 +64,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
else: else:
self.out_projs.append(None) self.out_projs.append(None)
weight = self.add_weight( weight = self.add_weight(
shape=(self.vocab_size, self.d_embed,), shape=(
self.vocab_size,
self.d_embed,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name="out_layers_._{}_._weight".format(i),
...@@ -86,7 +89,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): ...@@ -86,7 +89,10 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
) )
self.out_projs.append(weight) self.out_projs.append(weight)
weight = self.add_weight( weight = self.add_weight(
shape=(r_idx - l_idx, d_emb_i,), shape=(
r_idx - l_idx,
d_emb_i,
),
initializer="zeros", initializer="zeros",
trainable=True, trainable=True,
name="out_layers_._{}_._weight".format(i), name="out_layers_._{}_._weight".format(i),
......
...@@ -207,13 +207,12 @@ class TFMultipleChoiceLoss(TFSequenceClassificationLoss): ...@@ -207,13 +207,12 @@ class TFMultipleChoiceLoss(TFSequenceClassificationLoss):
class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss): class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
""" """
Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens. Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
.. note:: .. note::
Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
""" Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
"""
class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
......