Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
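
The diff below follows one pattern in every modeling file: a module-level _TOKENIZER_FOR_DOC constant is added, the forward/call method gains an @add_code_sample_docstrings(tokenizer_class=..., checkpoint=...) decorator, and the hand-written Examples:: blocks are deleted (or, where they stay, converted to doctest-style ">>>" samples). A minimal sketch of what such a decorator could look like; the real helper lives in file_utils.py and its exact signature and template may differ::

    # Sketch only: mirrors the decorator arguments seen in the diff
    # (tokenizer_class, checkpoint); the actual implementation in
    # src/transformers/file_utils.py may differ.

    _SAMPLE = """
        Example::

            >>> from transformers import {tokenizer_class}, {model_class}
            >>> import torch

            >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
            >>> model = {model_class}.from_pretrained("{checkpoint}")

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
    """


    def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
        def docstring_decorator(fn):
            # "MobileBertModel.forward" -> "MobileBertModel"
            model_class = fn.__qualname__.split(".")[0]
            fn.__doc__ = (fn.__doc__ or "") + _SAMPLE.format(
                tokenizer_class=tokenizer_class, model_class=model_class, checkpoint=checkpoint
            )
            return fn

        return docstring_decorator
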
@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
from .activations import gelu, gelu_new, swish
from .configuration_mobilebert import MobileBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import MobileBertModel, MobileBertTokenizer
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-model = MobileBertModel.from_pretrained(model_name_or_path)
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
heads.
Examples::
-from transformers import MobileBertTokenizer, MobileBertForPreTraining
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-prediction_scores, seq_relationship_scores = outputs[:2]
+>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+>>> import torch
+>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
+>>> outputs = model(input_ids)
+>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(
@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import MobileBertTokenizer, MobileBertForMaskedLM
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=input_ids)
-loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
Examples::
-from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
-prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
-loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+>>> import torch
+>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
"""
outputs = self.mobilebert(
@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import BertTokenizer, BertForSequenceClassification
-import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, logits = outputs[:2]
"""
outputs = self.mobilebert(
@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
-import torch
-tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
-model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
-question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-encoding = tokenizer.encode_plus(question, text)
-input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-assert answer == "a nice puppet"
"""
outputs = self.mobilebert(
@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-choice0 = "It is eaten with a fork and a knife."
-choice1 = "It is eaten while held in the hand."
-labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
-encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-# the linear classifier still needs to be trained
-loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import MobileBertTokenizer, MobileBertForTokenClassification
-import torch
-tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, scores = outputs[:2]
"""
outputs = self.mobilebert(
...
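
With the samples rewritten as ">>>" doctest lines, they can be executed straight from the docstrings. A generic way to do that with the standard library, not necessarily the test harness this PR adds, would be::

    # Generic doctest run over one module's docstrings; the PR's own test
    # setup may collect the samples differently.
    # Note: running the samples downloads the pretrained checkpoints.
    import doctest

    import transformers.modeling_mobilebert as mobilebert

    results = doctest.testmod(mobilebert, verbose=False, optionflags=doctest.ELLIPSIS)
    print(f"attempted={results.attempted}, failed={results.failed}")
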
@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu_new, swish
from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
-import torch
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-model = OpenAIGPTModel.from_pretrained('openai-gpt')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
return self.lm_head
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
-import torch
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=input_ids)
-loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
...
@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu, gelu_fast, gelu_new, swish
from .configuration_reformer import ReformerConfig
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "ReformerTokenizer"
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/reformer-crime-and-punishment",
"google/reformer-enwik8",
@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import ReformerModel, ReformerTokenizer
-import torch
-tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
pass
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import ReformerModelWithLMHead, ReformerTokenizer
-import torch
-tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=input_ids)
-loss, prediction_scores = outputs[:2]
"""
reformer_outputs = self.reformer(
...
@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import RobertaTokenizer, RobertaForMaskedLM
-import torch
-tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-model = RobertaForMaskedLM.from_pretrained('roberta-base')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=input_ids)
-loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import RobertaTokenizer, RobertaForSequenceClassification
-import torch
-tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, logits = outputs[:2]
"""
outputs = self.roberta(
input_ids,
@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import RobertaTokenizer, RobertaForMultipleChoice
-import torch
-tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-model = RobertaForMultipleChoice.from_pretrained('roberta-base')
-choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
-labels = torch.tensor(1).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import RobertaTokenizer, RobertaForTokenClassification
-import torch
-tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-model = RobertaForTokenClassification.from_pretrained('roberta-base')
-input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
-labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, scores = outputs[:2]
"""
outputs = self.roberta(
@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-# The checkpoint roberta-large is not fine-tuned for question answering. Please see the
-# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-from transformers import RobertaTokenizer, RobertaForQuestionAnswering
-import torch
-tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
-question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-input_ids = tokenizer.encode(question, text)
-start_scores, end_scores = model(torch.tensor([input_ids]))
-all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
outputs = self.roberta(
...
@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
# This dict contrains shortcut names and associated url
# for the pretrained weights provided with the models
@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-from transformers import T5Tokenizer, T5Model
-tokenizer = T5Tokenizer.from_pretrained('t5-small')
-model = T5Model.from_pretrained('t5-small')
-input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
-outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
+Example::
+>>> from transformers import T5Tokenizer, T5Model
+>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+>>> model = T5Model.from_pretrained('t5-small')
+>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
+>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Examples::
-from transformers import T5Tokenizer, T5ForConditionalGeneration
-tokenizer = T5Tokenizer.from_pretrained('t5-small')
-model = T5ForConditionalGeneration.from_pretrained('t5-small')
-input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
-outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
-loss, prediction_scores = outputs[:2]
-tokenizer = T5Tokenizer.from_pretrained('t5-small')
-model = T5ForConditionalGeneration.from_pretrained('t5-small')
-input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
-outputs = model.generate(input_ids)
+>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
+>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+>>> loss, prediction_scores = outputs[:2]
+>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
+>>> outputs = model.generate(input_ids)
"""
if "lm_labels" in kwargs:
...
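
The last T5 sample above stops at model.generate(input_ids). Turning the generated ids back into text is one tokenizer.decode call, using the same 't5-small' checkpoint and prompt as the diff; the decode step itself is standard tokenizer API rather than part of this change::

    >>> from transformers import T5Tokenizer, T5ForConditionalGeneration

    >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
    >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')

    >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
    >>> outputs = model.generate(input_ids)
    >>> summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
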
@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_albert import AlbertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
+_TOKENIZER_FOR_DOC = "AlbertTokenizer"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
"albert-large-v1",
@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.albert = TFAlbertMainLayer(config, name="albert")
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertModel
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertModel.from_pretrained('albert-base-v2')
-input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
-outputs = model(input_ids)
-last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.albert(inputs, **kwargs)
return outputs
@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertForMaskedLM
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
-input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
-outputs = model(input_ids)
-prediction_scores = outputs[0]
"""
outputs = self.albert(inputs, **kwargs)
@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
-input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
-labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertForTokenClassification
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
-input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
-labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-outputs = model(input_ids, labels=labels)
-loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-start_scores, end_scores = model(input_dict)
-all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs,
@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
-Examples::
-import tensorflow as tf
-from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
-tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
-choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
-labels = tf.reshape(tf.constant(1), (-1, 1))
-outputs = model(input_ids, labels=labels)
-loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
...
...@@ -22,7 +22,12 @@ import numpy as np ...@@ -22,7 +22,12 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_bert import BertConfig from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding ...@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased", "bert-base-uncased",
...@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel): ...@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name="bert") self.bert = TFBertMainLayer(config, name="bert")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
...@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel): ...@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
return outputs return outputs
...@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): ...@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
return self.bert.embeddings return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): ...@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
""" """
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
...@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific ...@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call( def call(
self, self,
inputs, inputs,
...@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMultipleChoice
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL ...@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
) )
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) ...@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
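The recurring change in this diff is the new add_code_sample_docstrings decorator imported from file_utils, which replaces the hand-written Examples:: blocks; only its call sites are visible here. As a rough, hypothetical sketch of the idea (not the actual file_utils implementation), such a decorator can be read as a template formatter that appends a standardized doctest sample built from the tokenizer class and checkpoint name:

# Hypothetical sketch only: the real helper lives in transformers.file_utils and its
# exact template and signature are not shown in this diff.
def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
    sample = """
    Example::

        >>> from transformers import {tokenizer_class}, {{model_class}}
        >>> import tensorflow as tf

        >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
        >>> model = {{model_class}}.from_pretrained("{checkpoint}")

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
        >>> outputs = model(inputs)
    """.format(tokenizer_class=tokenizer_class, checkpoint=checkpoint)

    def docstring_decorator(fn):
        # Derive the model class name from the decorated method (e.g. "TFCTRLModel.call").
        model_class = fn.__qualname__.split(".")[0]
        fn.__doc__ = (fn.__doc__ or "") + sample.format(model_class=model_class)
        return fn

    return docstring_decorator

Under that reading, each call site only needs to supply the tokenizer class and a canonical checkpoint name, as the decorators added throughout this diff do.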
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_ctrl import CTRLConfig from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
TFSharedEmbeddings, TFSharedEmbeddings,
...@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding ...@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CtrlTokenizer"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl" "ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl # See all CTRL models at https://huggingface.co/models?filter=ctrl
...@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): ...@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer = TFCTRLMainLayer(config, name="transformer") self.transformer = TFCTRLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel): ...@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLModel.from_pretrained('ctrl')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): ...@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): ...@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLLMHeadModel.from_pretrained('ctrl')
input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
outputs = model(input_ids)
loss, logits = outputs[:2]
""" """
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
......
...@@ -23,7 +23,12 @@ import numpy as np ...@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_distilbert import DistilBertConfig from .configuration_distilbert import DistilBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding ...@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased", "distilbert-base-uncased",
...@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): ...@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
...@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): ...@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.distilbert(inputs, **kwargs) outputs = self.distilbert(inputs, **kwargs)
return outputs return outputs
...@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): ...@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return self.vocab_projector.input_embeddings return self.vocab_projector.input_embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
...@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): ...@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
""" """
distilbert_output = self.distilbert(inputs, **kwargs) distilbert_output = self.distilbert(inputs, **kwargs)
...@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ...@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque ...@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels labels = inputs[6] if len(inputs) > 6 else labels
...@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
) )
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla ...@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels labels = inputs[6] if len(inputs) > 6 else labels
...@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call( def call(
self, self,
inputs, inputs,
...@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic ...@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ...@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
self.dropout = tf.keras.layers.Dropout(config.qa_dropout) self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn ...@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[6] if len(inputs) > 6 else start_positions start_positions = inputs[6] if len(inputs) > 6 else start_positions
......
...@@ -4,7 +4,7 @@ import tensorflow as tf ...@@ -4,7 +4,7 @@ import tensorflow as tf
from transformers import ElectraConfig from transformers import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFQuestionAnsweringLoss, TFQuestionAnsweringLoss,
...@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding ...@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator", "google/electra-small-generator",
...@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel): ...@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
self.electra = TFElectraMainLayer(config, name="electra") self.electra = TFElectraMainLayer(config, name="electra")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
...@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel): ...@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraModel
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.electra(inputs, **kwargs) outputs = self.electra(inputs, **kwargs)
return outputs return outputs
...@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): ...@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
return self.generator_lm_head return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def call( def call(
self, self,
input_ids=None, input_ids=None,
...@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): ...@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
""" """
generator_hidden_states = self.electra( generator_hidden_states = self.electra(
...@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ...@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
) )
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific ...@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForTokenClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
) )
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin ...@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFConv1D, TFConv1D,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding ...@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2", "gpt2",
"gpt2-medium", "gpt2-medium",
...@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel): ...@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name="transformer") self.transformer = TFGPT2MainLayer(config, name="transformer")
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel): ...@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): ...@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]} return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): ...@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
""" """
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
...@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): ...@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
Examples:: Examples::
# For example purposes. Not runnable. >>> import tensorflow as tf
import tensorflow as tf >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2') >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0 >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices] >>> encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2 >>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1 >>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids) >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
......
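The double-heads example above is one of the samples rewritten into >>> doctest form, which is what makes it mechanically checkable. The actual test harness is not part of this diff, but a minimal way to exercise such a docstring, using only the standard-library doctest module, would look like this:

# Illustrative only: run the doctest-style sample embedded in a method's docstring.
import doctest

from transformers import TFGPT2DoubleHeadsModel

doctest.run_docstring_examples(
    TFGPT2DoubleHeadsModel.call,  # the __doc__ of this method now carries the ">>>" sample
    globs={},
    verbose=True,
    optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
)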
...@@ -21,7 +21,12 @@ import logging ...@@ -21,7 +21,12 @@ import logging
import tensorflow as tf import tensorflow as tf
from . import MobileBertConfig from . import MobileBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
...@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding ...@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mobilebert-uncased", "mobilebert-uncased",
...@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer): ...@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
return prediction_scores return prediction_scores
class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
def call(self, inputs):
sequence_output, pooled_output = inputs
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
@keras_serializable @keras_serializable
class TFMobileBertMainLayer(tf.keras.layers.Layer): class TFMobileBertMainLayer(tf.keras.layers.Layer):
config_class = MobileBertConfig config_class = MobileBertConfig
...@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel): ...@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
...@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel): ...@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertModel
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
return outputs return outputs
...@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): ...@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertPreTrainingHeads(config, name="cls") self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
def get_output_embeddings(self): def get_output_embeddings(self):
return self.mobilebert.embeddings return self.mobilebert.embeddings
...@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): ...@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
Examples:: Examples::
import tensorflow as tf >>> import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForPreTraining >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased') >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids) >>> outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2] sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output]) prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[ outputs = (prediction_scores, seq_relationship_score,) + outputs[
2: 2:
] # add hidden states and attention if they are here ] # add hidden states and attention if they are here
...@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel): ...@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
return self.mobilebert.embeddings return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel): ...@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
""" """
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
...@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert") self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertOnlyNSPHead(config, name="cls") self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
...@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): ...@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
Examples:: Examples::
import tensorflow as tf >>> import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased') >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf') >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random
""" """
outputs = self.mobilebert(inputs, **kwargs) outputs = self.mobilebert(inputs, **kwargs)
...@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ...@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque ...@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForSequenceClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn ...@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn ...@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased') # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[8] if len(inputs) > 8 else start_positions
...@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call( def call(
self, self,
inputs, inputs,
...@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic ...@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ...@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
) )
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla ...@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFConv1D, TFConv1D,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding ...@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt", "openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
...@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): ...@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): ...@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): ...@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): ...@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
""" """
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
...@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): ...@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Examples:: Examples::
# For example purposes. Not runnable. >>> import tensorflow as tf
import tensorflow as tf >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# Add a [CLS] to the vocabulary (we should train it also!) >>> # Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0 >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
raise NotImplementedError >>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) >>> print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary >>> print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices
mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"] = tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :] # Batch size 1
>>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
...@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): ...@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
assert len(inputs) <= 8, "Too many inputs." assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids) token_type_ids = inputs.get("token_type_ids", token_type_ids)
......
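Both double-heads rewrites (TFGPT2DoubleHeadsModel earlier and TFOpenAIGPTDoubleHeadsModel here) rely on the same batching pattern: encode num_choices sequences, then prepend a batch dimension with tf.expand_dims. A small standalone shape check of that pattern, assuming the choices happen to tokenize to the same length as in the sample above, looks like this:

# Shape check for the multiple-choice batching pattern used in the rewritten examples.
import tensorflow as tf
from transformers import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]

# Both choices tokenize to the same length here, so they stack into a (2, seq_len) tensor.
encoding = tokenizer(choices, return_tensors="tf")
inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}  # add the batch dimension

print(inputs["input_ids"].shape)  # (1, 2, seq_len): batch of 1 with 2 choices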
...@@ -21,7 +21,12 @@ import logging ...@@ -21,7 +21,12 @@ import logging
import tensorflow as tf import tensorflow as tf
from .configuration_roberta import RobertaConfig from .configuration_roberta import RobertaConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
...@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding ...@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base", "roberta-base",
"roberta-large", "roberta-large",
...@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel): ...@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Returns: Returns:
...@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel): ...@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(inputs, **kwargs)
return outputs return outputs
...@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): ...@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
return self.lm_head.decoder return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): ...@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
""" """
outputs = self.roberta(inputs, **kwargs) outputs = self.roberta(inputs, **kwargs)
...@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier") self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla ...@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
def __init__(self, config, *inputs, **kwargs): def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs) super().__init__(config, *inputs, **kwargs)
self.roberta = TFBertMainLayer(config, name="roberta") self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense( self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
...@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call( def call(
self, self,
inputs, inputs,
...@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) ...@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific ...@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels labels = inputs[8] if len(inputs) > 8 else labels
...@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
) )
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin ...@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
# The checkpoint roberta-base is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
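The @add_code_sample_docstrings decorator added throughout the file above replaces the hand-written Examples:: blocks that the diff removes. A minimal sketch of the pattern, assuming a single simplified template (the library's real implementation is richer and selects templates per task and model class; the template text here is illustrative only):

    _SAMPLE_TEMPLATE = """
        Examples::

            >>> from transformers import {tokenizer_class}, AutoModel
            >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
            >>> model = AutoModel.from_pretrained('{checkpoint}')
            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors='pt')  # Batch size 1
            >>> outputs = model(input_ids)
    """

    def add_code_sample_docstrings_sketch(*, tokenizer_class, checkpoint):
        def decorator(fn):
            # append a filled-in code sample to whatever docstring the function already has
            fn.__doc__ = (fn.__doc__ or "") + _SAMPLE_TEMPLATE.format(
                tokenizer_class=tokenizer_class, checkpoint=checkpoint
            )
            return fn
        return decorator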
...@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding ...@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"t5-small", "t5-small",
"t5-base", "t5-base",
...@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel): ...@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):
Examples:: Examples::
from transformers import T5Tokenizer, TFT5Model >>> from transformers import T5Tokenizer, TFT5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small') >>> model = TFT5Model.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
...@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel): ...@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
Examples:: Examples::
from transformers import T5Tokenizer, TFT5ForConditionalGeneration >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs) >>> outputs = model(inputs, decoder_input_ids=inputs)
prediction_scores = outputs[0] >>> prediction_scores = outputs[0]
tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1 >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
model.generate(inputs) >>> result = model.generate(inputs)
""" """
......
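With the T5 samples above rewritten to use '>>>' prompts, they become executable doctest-style examples. A sketch of how such samples could be exercised automatically (an assumption about the approach, not the repository's actual test harness):

    import doctest

    def run_docstring_samples(obj):
        # parse the '>>>' examples out of obj's docstring and execute them
        parser = doctest.DocTestParser()
        test = parser.get_doctest(obj.__doc__ or "", globs={}, name=obj.__name__, filename=None, lineno=None)
        results = doctest.DocTestRunner(optionflags=doctest.ELLIPSIS).run(test)
        assert results.failed == 0, f"{results.failed} failing sample(s) in {obj.__name__}"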
...@@ -22,7 +22,7 @@ import logging ...@@ -22,7 +22,7 @@ import logging
import tensorflow as tf import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFPreTrainedModel, TFPreTrainedModel,
...@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding ...@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103", "transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
...@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): ...@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): ...@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz) return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call( def call(
self, self,
inputs, inputs,
...@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): ...@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
labels = inputs[4] if len(inputs) > 4 else labels labels = inputs[4] if len(inputs) > 4 else labels
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
assert len(inputs) <= 6, "Too many inputs." assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict): elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems) mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
......
...@@ -24,7 +24,12 @@ import numpy as np ...@@ -24,7 +24,12 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_xlm import XLMConfig from .configuration_xlm import XLMConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding ...@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048", "xlm-mlm-en-2048",
"xlm-mlm-ende-1024", "xlm-mlm-ende-1024",
...@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel): ...@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel): ...@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): ...@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs} return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): ...@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
transformer_outputs = self.transformer(inputs, **kwargs) transformer_outputs = self.transformer(inputs, **kwargs)
...@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat ...@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels labels = inputs[11] if len(inputs) > 11 else labels
...@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call( def call(
self, self,
inputs, inputs,
...@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
cache=None, cache=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
labels=None,
training=False, training=False,
): ):
r""" r"""
...@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs[7] if len(inputs) > 7 else head_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
assert len(inputs) <= 10, "Too many inputs." output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)): elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids") input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask) attention_mask = inputs.get("attention_mask", attention_mask)
...@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): ...@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs.get("head_mask", head_mask) head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions) output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 10, "Too many inputs." output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs."
else: else:
input_ids = inputs input_ids = inputs
...@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ...@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.transformer = TFXLMMainLayer(config, name="transformer") self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = tf.keras.layers.Dropout(config.dropout) self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense( self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
) )
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call( def call(
self, self,
input_ids=None, inputs=None,
attention_mask=None, attention_mask=None,
langs=None, langs=None,
token_type_ids=None, token_type_ids=None,
...@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ...@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
cache=None, cache=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
labels=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
labels=None,
training=False, training=False,
): ):
r""" r"""
...@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ...@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForTokenClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 11:
inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, inputs,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
position_ids=position_ids, position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask, head_mask=head_mask,
inputs_embeds=inputs_embeds, inputs_embeds=inputs_embeds,
output_attentions=output_attentions, output_attentions=output_attentions,
...@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos ...@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training) sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output) logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None: if labels is not None:
loss = self.compute_loss(labels, logits) loss = self.compute_loss(labels, logits)
...@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL ...@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
) )
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL ...@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions start_positions = inputs[11] if len(inputs) > 11 else start_positions
......
...@@ -23,7 +23,12 @@ import numpy as np ...@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
from .configuration_xlnet import XLNetConfig from .configuration_xlnet import XLNetConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import ( from .modeling_tf_utils import (
TFMultipleChoiceLoss, TFMultipleChoiceLoss,
TFPreTrainedModel, TFPreTrainedModel,
...@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding ...@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased", "xlnet-base-cased",
"xlnet-large-cased", "xlnet-large-cased",
...@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel): ...@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer") self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(self, inputs, **kwargs): def call(self, inputs, **kwargs):
r""" r"""
Return: Return:
...@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel): ...@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
outputs = self.transformer(inputs, **kwargs) outputs = self.transformer(inputs, **kwargs)
return outputs return outputs
...@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): ...@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
# We show how to setup inputs to predict a next token using a bi-directional context. # We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1])) perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32)) outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
...@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif ...@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif ...@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[12] if len(inputs) > 12 else labels
...@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): ...@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
input_ids = inputs[0] input_ids = inputs[0]
...@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio ...@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio ...@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels labels = inputs[12] if len(inputs) > 12 else labels
...@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer ...@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
) )
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call( def call(
self, self,
inputs=None, inputs=None,
...@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer ...@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
""" """
if isinstance(inputs, (tuple, list)): if isinstance(inputs, (tuple, list)):
start_positions = inputs[12] if len(inputs) > 12 else start_positions start_positions = inputs[12] if len(inputs) > 12 else start_positions
......
...@@ -27,13 +27,15 @@ import torch.nn as nn ...@@ -27,13 +27,15 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from .configuration_transfo_xl import TransfoXLConfig from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [ TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103", "transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
...@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems return new_mems
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel): ...@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz) return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): ...@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
""" """
if input_ids is not None: if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1) bsz, tgt_len = input_ids.size(0), input_ids.size(1)
......
...@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin): ...@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
Examples:: Examples::
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
outputs = model.generate(max_length=40) # do greedy decoding outputs = model.generate(max_length=40) # do greedy decoding
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. model = AutoModelForCausalLM.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
input_context = 'The dog' input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
...@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin): ...@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
input_context = 'The dog' input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # 3 generate sequences using by sampling
for i in range(3): # 3 output sequences were generated for i in range(3): # 3 output sequences were generated
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. model = AutoModelForCausalLM.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. model = AutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl
bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
......