Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
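
Throughout the hunks below, hand-written `Examples::` blocks are either converted to doctest-style (`>>>`) samples or dropped in favour of an `add_code_sample_docstrings(tokenizer_class=..., checkpoint=...)` decorator imported from `.file_utils`. The decorator's implementation is not part of this diff; as a rough, hypothetical sketch only (the real helper in `file_utils.py` may differ), such a decorator could render a shared doctest template and append it to the wrapped method's docstring:

    # Hypothetical sketch, NOT the actual transformers implementation: builds a
    # doctest-style usage sample from the tokenizer class and checkpoint name
    # and appends it to the decorated forward/call method's docstring.
    _SAMPLE_TEMPLATE = """
        Example::

            >>> from transformers import {tokenizer_class}, {model_class}
            >>> import torch

            >>> tokenizer = {tokenizer_class}.from_pretrained("{checkpoint}")
            >>> model = {model_class}.from_pretrained("{checkpoint}")

            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
    """


    def add_code_sample_docstrings(tokenizer_class=None, checkpoint=None):
        def decorator(fn):
            # fn.__qualname__ is e.g. "MobileBertModel.forward"; the class name
            # is enough to fill in the import line of the sample.
            model_class = fn.__qualname__.split(".")[0]
            sample = _SAMPLE_TEMPLATE.format(
                tokenizer_class=tokenizer_class,
                checkpoint=checkpoint,
                model_class=model_class,
            )
            fn.__doc__ = (fn.__doc__ or "") + sample
            return fn

        return decorator

Because the injected samples use the `>>>` prompt, they can presumably be collected and executed by a doctest-style runner, which is what the "Test docstrings" / "Test the remainder of the code samples" steps in the commit message refer to.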
......@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
from .activations import gelu, gelu_new, swish
from .configuration_mobilebert import MobileBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
......@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertModel, MobileBertTokenizer
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertModel.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForPreTraining
import torch
tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
>>> import torch
>>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
>>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(
......@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMaskedLM
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
Examples::
from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
import torch
>>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
>>> import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
"""
outputs = self.mobilebert(
......@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.mobilebert(
......@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
"""
outputs = self.mobilebert(
......@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def forward(
self,
input_ids=None,
......@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import MobileBertTokenizer, MobileBertForTokenClassification
import torch
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.mobilebert(
......
......@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu_new, swish
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
......@@ -40,6 +40,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
......@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
......@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
return self.lm_head
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def forward(
self,
input_ids=None,
......@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import torch
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......
......@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu, gelu_fast, gelu_new, swish
from .configuration_reformer import ReformerConfig
from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
DUMMY_INPUTS,
DUMMY_MASK,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ReformerTokenizer"
REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/reformer-crime-and-punishment",
"google/reformer-enwik8",
......@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
......@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModel, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
pass
@add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
def forward(
self,
input_ids=None,
......@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ReformerModelWithLMHead, ReformerTokenizer
import torch
tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
model = ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
reformer_outputs = self.reformer(
......
......@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_roberta import RobertaConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
......@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.roberta(
input_ids,
......@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import RobertaTokenizer, RobertaForTokenClassification
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.roberta(
......@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def forward(
self,
input_ids=None,
......@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
outputs = self.roberta(
......
......@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
####################################################
# This dict contrains shortcut names and associated url
# for the pretrained weights provided with the models
......@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Example::
>>> from transformers import T5Tokenizer, T5Model
from transformers import T5Tokenizer, T5Model
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5Model.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
use_cache = use_cache if use_cache is not None else self.config.use_cache
......@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, T5ForConditionalGeneration
>>> from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model.generate(input_ids)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
>>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1
>>> outputs = model.generate(input_ids)
"""
if "lm_labels" in kwargs:
......
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_albert import AlbertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
"albert-large-v1",
......@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
self.albert = TFAlbertMainLayer(config, name="albert")
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.albert(inputs, **kwargs)
return outputs
......@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
return self.albert.embeddings
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMaskedLM
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.albert(inputs, **kwargs)
......@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForTokenClassification
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs=None,
......@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def call(
self,
inputs,
......@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......
......@@ -22,7 +22,12 @@ import numpy as np
import tensorflow as tf
from .configuration_bert import BertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
......@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
self.bert = TFBertMainLayer(config, name="bert")
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(inputs, **kwargs)
return outputs
......@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
return self.bert.embeddings
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.bert(inputs, **kwargs)
......@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs,
......@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMultipleChoice
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForTokenClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
def call(
self,
inputs=None,
......@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import BertTokenizer, TFBertForQuestionAnswering
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFPreTrainedModel,
TFSharedEmbeddings,
......@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CtrlTokenizer"
TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
......@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
self.transformer = TFCTRLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLModel.from_pretrained('ctrl')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import CTRLTokenizer, TFCTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = TFCTRLLMHeadModel.from_pretrained('ctrl')
input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
outputs = model(input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......
......@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_distilbert import DistilBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
......@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.distilbert(inputs, **kwargs)
return outputs
......@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
return self.vocab_projector.input_embeddings
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(self, inputs, **kwargs):
r"""
......@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
distilbert_output = self.distilbert(inputs, **kwargs)
......@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
......@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[6] if len(inputs) > 6 else labels
......@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs,
......@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def call(
self,
inputs=None,
......@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[6] if len(inputs) > 6 else start_positions
......
......@@ -4,7 +4,7 @@ import tensorflow as tf
from transformers import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from .modeling_tf_utils import (
TFQuestionAnsweringLoss,
......@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
......@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
self.electra = TFElectraMainLayer(config, name="electra")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraModel
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.electra(inputs, **kwargs)
return outputs
......@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def call(
self,
input_ids=None,
......@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
generator_hidden_states = self.electra(
......@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
......@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForTokenClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def call(
self,
inputs=None,
......@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
......@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2",
"gpt2-medium",
......@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
self.transformer = TFGPT2MainLayer(config, name="transformer")
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
>>> input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
>>> mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
......
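The removed GPT-2 samples only retrieved logits; a minimal generation sketch for `TFGPT2LMHeadModel`, assuming the default (greedy) settings of `generate`:

>>> from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = TFGPT2LMHeadModel.from_pretrained('gpt2')
>>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors='tf')  # Batch size 1
>>> generated = model.generate(input_ids, max_length=20)  # greedy decoding by default
>>> print(tokenizer.decode(generated[0], skip_special_tokens=True))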
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from . import MobileBertConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"mobilebert-uncased",
......@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
return prediction_scores
class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
def call(self, inputs):
sequence_output, pooled_output = inputs
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
@keras_serializable
class TFMobileBertMainLayer(tf.keras.layers.Layer):
config_class = MobileBertConfig
......@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertModel
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.mobilebert(inputs, **kwargs)
return outputs
......@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertPreTrainingHeads(config, name="cls")
self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
def get_output_embeddings(self):
return self.mobilebert.embeddings
......@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
outputs = self.mobilebert(inputs, **kwargs)
sequence_output, pooled_output = outputs[:2]
prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output])
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
outputs = (prediction_scores, seq_relationship_score,) + outputs[
2:
] # add hidden states and attention if they are here
......@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
return self.mobilebert.embeddings
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
......@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
super().__init__(config, *inputs, **kwargs)
self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
self.cls = TFMobileBertOnlyNSPHead(config, name="cls")
self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
def call(self, inputs, **kwargs):
......@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random
>>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
"""
outputs = self.mobilebert(inputs, **kwargs)
......@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFBMobileBertForSequenceClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased') # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
assert answer == "a nice puppet"
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs,
......@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
)
@add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
def call(
self,
inputs=None,
......@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......
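The removed MobileBERT question-answering sample is not reproduced verbatim by the decorator; a sketch of the same span-extraction flow with the callable tokenizer API used elsewhere in this change (note that `google/mobilebert-uncased` is not fine-tuned for QA, so the extracted span is not meaningful):

>>> import tensorflow as tf
>>> from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
>>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
>>> model = TFMobileBertForQuestionAnswering.from_pretrained('google/mobilebert-uncased')  # not fine-tuned for QA
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(inputs)
>>> all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0] + 1])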
......@@ -22,7 +22,7 @@ import numpy as np
import tensorflow as tf
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_utils import (
TFConv1D,
TFPreTrainedModel,
......@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"openai-gpt",
# See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
......@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
return self.transformer.tokens_embed
@add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
......@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
Examples::
# For example purposes. Not runnable.
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
>>> import tensorflow as tf
>>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
>>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices
mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :] # Batch size 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
>>> model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
>>> print(tokenizer.cls_token_id, len(tokenizer)) # The new token is the last token of the vocabulary

>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoding = tokenizer(choices, return_tensors="tf")
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
>>> inputs["mc_token_ids"] = tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
>>> outputs = model(inputs)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
......@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
......
......@@ -21,7 +21,12 @@ import logging
import tensorflow as tf
from .configuration_roberta import RobertaConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
......@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"roberta-base",
"roberta-large",
......@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
self.roberta = TFRobertaMainLayer(config, name="roberta")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Returns:
......@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.roberta(inputs, **kwargs)
return outputs
......@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
return self.lm_head.decoder
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
outputs = self.roberta(inputs, **kwargs)
......@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
self.classifier = TFRobertaClassificationHead(config, name="classifier")
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.roberta = TFBertMainLayer(config, name="roberta")
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
......@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs,
......@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForTokenClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[8] if len(inputs) > 8 else labels
......@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
)
@add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
def call(
self,
inputs=None,
......@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint roberta-base is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[8] if len(inputs) > 8 else start_positions
......
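A sketch of a masked-token prediction with `TFRobertaForMaskedLM` and the callable tokenizer API used elsewhere in this change (the `<mask>` handling follows the RoBERTa tokenizer defaults):

>>> import tensorflow as tf
>>> from transformers import RobertaTokenizer, TFRobertaForMaskedLM
>>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
>>> model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
>>> inputs = tokenizer("The capital of France is <mask>.", return_tensors='tf')
>>> prediction_scores = model(inputs)[0]  # shape (batch_size, sequence_length, vocab_size)
>>> mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
>>> predicted_id = int(tf.math.argmax(prediction_scores[0, mask_index]))
>>> print(tokenizer.decode([predicted_id]))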
......@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "T5Tokenizer"
TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
"t5-small",
"t5-base",
......@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5Model
>>> from transformers import T5Tokenizer, TFT5Model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5Model.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
......@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
Examples::
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)
prediction_scores = outputs[0]
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> outputs = model(inputs, decoder_input_ids=inputs)
>>> prediction_scores = outputs[0]
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
model.generate(inputs)
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf") # Batch size 1
>>> result = model.generate(inputs)
"""
......
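A sketch combining the two T5 snippets above into an end-to-end summarization call, assuming the default generation settings:

>>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
>>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
>>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
>>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
>>> summary_ids = model.generate(inputs, max_length=20)
>>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))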
......@@ -22,7 +22,7 @@ import logging
import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .modeling_tf_utils import (
TFPreTrainedModel,
......@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
......@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
self.transformer = TFTransfoXLMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def call(
self,
inputs,
......@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
labels = inputs[4] if len(inputs) > 4 else labels
output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
elif isinstance(inputs, (BatchEncoding, dict)):
input_ids = inputs.get("input_ids")
mems = inputs.get("mems", mems)
head_mask = inputs.get("head_mask", head_mask)
......
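The removed Transformer-XL samples did not show memory reuse; a sketch of passing `mems` across segments with `TFTransfoXLLMHeadModel` (the argument name is taken from the input handling above):

>>> import tensorflow as tf
>>> from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
>>> prediction_scores, mems = model(input_ids)[:2]
>>> next_ids = tf.constant(tokenizer.encode("He likes to play fetch"))[None, :]
>>> prediction_scores, mems = model(next_ids, mems=mems)[:2]  # reuse the cached memories from the previous segment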
......@@ -24,7 +24,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLMTokenizer"
TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlm-mlm-en-2048",
"xlm-mlm-ende-1024",
......@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
self.transformer = TFXLMMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
return {"inputs": inputs, "langs": langs}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
transformer_outputs = self.transformer(inputs, **kwargs)
......@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
......@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
......@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs,
......@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
......@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForMultipleChoice
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs[7] if len(inputs) > 7 else head_mask
inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
labels = inputs[11] if len(inputs) > 11 else labels
assert len(inputs) <= 11, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
......@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
output_attentions = inputs.get("output_attentions", output_attentions)
assert len(inputs) <= 10, "Too many inputs."
output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
labels = inputs.get("labels", labels)
assert len(inputs) <= 12, "Too many inputs."
else:
input_ids = inputs
......@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
self.transformer = TFXLMMainLayer(config, name="transformer")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.classifier = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
input_ids=None,
inputs=None,
attention_mask=None,
langs=None,
token_type_ids=None,
......@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
cache=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
labels=None,
training=False,
):
r"""
......@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForTokenClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[11] if len(inputs) > 11 else labels
if len(inputs) > 11:
inputs = inputs[:11]
elif isinstance(inputs, (dict, BatchEncoding)):
labels = inputs.pop("labels", labels)
transformer_outputs = self.transformer(
input_ids,
inputs,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
......@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
sequence_output = self.dropout(sequence_output, training=training)
logits = self.classifier(sequence_output)
outputs = (logits,) + transformer_outputs[2:] # add hidden states and attention if they are here
outputs = (logits,) + transformer_outputs[1:] # add hidden states and attention if they are here
if labels is not None:
loss = self.compute_loss(labels, logits)
......@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
)
@add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
def call(
self,
inputs=None,
......@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[11] if len(inputs) > 11 else start_positions
......
......@@ -23,7 +23,12 @@ import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
MULTIPLE_CHOICE_DUMMY_INPUTS,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_tf_utils import (
TFMultipleChoiceLoss,
TFPreTrainedModel,
......@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "XLNetTokenizer"
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
"xlnet-base-cased",
"xlnet-large-cased",
......@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
self.transformer = TFXLNetMainLayer(config, name="transformer")
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(self, inputs, **kwargs):
r"""
Return:
......@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.transformer(inputs, **kwargs)
return outputs
......@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
......@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
......@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
labels = tf.reshape(tf.constant(1), (-1, 1))
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
......@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForTokenClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
if isinstance(inputs, (tuple, list)):
labels = inputs[12] if len(inputs) > 12 else labels
......@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
)
@add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
def call(
self,
inputs=None,
......@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
if isinstance(inputs, (tuple, list)):
start_positions = inputs[12] if len(inputs) > 12 else start_positions
......
......@@ -27,13 +27,15 @@ import torch.nn as nn
import torch.nn.functional as F
from .configuration_transfo_xl import TransfoXLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"transfo-xl-wt103",
# See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
......@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
......@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(bsz)
@add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
def forward(
self,
input_ids=None,
......@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
import torch
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
if input_ids is not None:
bsz, tgt_len = input_ids.size(0), input_ids.size(1)
......
......@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
Examples::
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
outputs = model.generate(max_length=40) # do greedy decoding
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
......@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
input_context = 'The dog'
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling
outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 sequences by sampling
for i in range(3): # 3 output sequences were generated
print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
model = AutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
input_context = 'My cute dog'  # context to generate from
bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
......
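The last `generate` example is cut off by the diff view; a sketch of how `bad_words_ids` is presumably passed, using only the variables defined in the lines above:

outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences that avoid the banned words
print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))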