Unverified Commit 364a5ae1 authored by Lysandre Debut, committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
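The commit message above describes rewriting the library's code samples as doctest-style `>>>` snippets and running them as tests. As a hedged aside, samples in that format can in principle be collected and executed with Python's built-in doctest module; the target module below is only an illustration, and the commit's actual test harness is not shown here.

import doctest

import transformers.configuration_bert as target_module  # illustrative target; any refactored module would do

# Collect and run every `>>>`-style example found in the module's docstrings.
# ELLIPSIS lets an expected output elide volatile details such as tensor values.
results = doctest.testmod(target_module, verbose=False, optionflags=doctest.ELLIPSIS)
print(results)  # e.g. TestResults(failed=0, attempted=...)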
@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):
Example::
>>> from transformers import ReformerModel, ReformerConfig
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "reformer"
...
@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):
Example::
>>> from transformers import RobertaConfig, RobertaModel
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"
...
@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):
Example::
>>> from transformers import TransfoXLConfig, TransfoXLModel
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "transfo-xl"
...
@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):
Example::
>>> from transformers import XLMConfig, XLMModel
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"
...
@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):
Example::
>>> from transformers import XLNetConfig, XLNetModel
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"
...
@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):
Examples::
>>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad")
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:
...
@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
return docstring_decorator
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, scores = outputs[:2]
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, logits = outputs[:2]
"""
PT_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
>>> outputs = model(input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
"""
PT_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
"""
PT_CAUSAL_LM_SAMPLE = r"""
Example::
>>> import torch
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss, logits = outputs[:2]
"""
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
>>> outputs = model(inputs)
>>> loss, scores = outputs[:2]
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(input_dict)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs)
>>> loss, logits = outputs[:2]
"""
TF_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores = outputs[0]
"""
TF_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs[0]
"""
TF_CAUSAL_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs[0]
"""
def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
def docstring_decorator(fn):
model_class = fn.__qualname__.split(".")[0]
is_tf_class = model_class[:2] == "TF"
if "SequenceClassification" in model_class:
code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
elif "QuestionAnswering" in model_class:
code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
elif "TokenClassification" in model_class:
code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
elif "MultipleChoice" in model_class:
code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
elif "MaskedLM" in model_class:
code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
elif "LMHead" in model_class:
code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
elif "Model" in model_class:
code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
return fn
return docstring_decorator
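As a reading aid for the decorator above: it picks one of the PT_*/TF_* templates based on the decorated class's name, then fills the `{model_class}`, `{tokenizer_class}` and `{checkpoint}` placeholders via `str.format` (which is also why literal dict braces in the multiple-choice samples are escaped as `{{...}}`). A minimal sketch of that rendering step, with a class and checkpoint picked purely for illustration and assuming the constants above are in scope:

# Illustration only: render the PyTorch base-model template by hand.
rendered = PT_BASE_MODEL_SAMPLE.format(
    model_class="AlbertModel",          # normally derived from fn.__qualname__
    tokenizer_class="AlbertTokenizer",  # passed to add_code_sample_docstrings
    checkpoint="albert-base-v2",        # passed to add_code_sample_docstrings
)
print(rendered)  # a ready-to-run, doctest-style code sample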
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
...
@@ -24,13 +24,15 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_albert import AlbertConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertModel, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel): ...@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
Examples:: Examples::
from transformers import AlbertTokenizer, AlbertForPreTraining >>> from transformers import AlbertTokenizer, AlbertForPreTraining
import torch >>> import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForPreTraining.from_pretrained('albert-base-v2') >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids) >>> outputs = model(input_ids)
prediction_scores, sop_scores = outputs[:2] >>> prediction_scores, sop_scores = outputs[:2]
""" """
...@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): ...@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return self.predictions.decoder return self.predictions.decoder
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): ...@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.albert( outputs = self.albert(
...@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel): ...@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForTokenClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.albert( outputs = self.albert(
...@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): ...@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
start_scores, end_scores = model(**input_dict)
""" """
outputs = self.albert( outputs = self.albert(
...@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel): ...@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForMultipleChoice
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
@@ -392,8 +392,8 @@ class AutoModel:
Examples::
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_MAPPING.items():
if isinstance(config, config_class):
@@ -480,8 +480,7 @@ class AutoModel:
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
assert model.config.output_attentions == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
@@ -547,8 +546,8 @@ class AutoModelForPreTraining:
Examples::
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
if isinstance(config, config_class):
...
@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_bart import BartConfig
from .file_utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BartTokenizer"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example::
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# see ``examples/summarization/bart/run_eval.py`` for a longer example
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
self.init_weights()
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(PretrainedBartModel):
base_model_prefix = "model"
@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
self.register_buffer("final_logits_bias", new_bias)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids,
@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Conditional generation example::
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids)[0]
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
"""
@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
self.model._init_weights(self.classification_head.out_proj)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1092,28 +1107,15 @@ class BartForSequenceClassification(PretrainedBartModel):
Classification loss (cross entropy)
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
from transformers import BartTokenizer, BartForSequenceClassification
import torch
tokenizer = BartTokenizer.from_pretrained('bart-large')
model = BartForSequenceClassification.from_pretrained('bart-large')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if labels is not None: if labels is not None:
use_cache = False use_cache = False
...@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel): ...@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
self.model._init_weights(self.qa_outputs) self.model._init_weights(self.qa_outputs)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint bart-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import BartTokenizer, BartForQuestionAnswering
import torch
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
""" """
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
use_cache = False use_cache = False
...@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel): ...@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
total_loss = (start_loss + end_loss) / 2 total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) return outputs # return outputs # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)
class SinusoidalPositionalEmbedding(nn.Embedding): class SinusoidalPositionalEmbedding(nn.Embedding):
......
@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import gelu, gelu_new, swish
from .configuration_bert import BertConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
"bert-large-uncased",
@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertModel, BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):
Examples:: Examples::
from transformers import BertTokenizer, BertForPreTraining >>> from transformers import BertTokenizer, BertForPreTraining
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased') >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(input_ids) >>> outputs = model(**inputs)
prediction_scores, seq_relationship_scores = outputs[:2] >>> prediction_scores, seq_relationship_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(
@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
return self.cls.predictions.decoder
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMaskedLM
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
Examples:: Examples::
from transformers import BertTokenizer, BertForNextSentencePrediction >>> from transformers import BertTokenizer, BertForNextSentencePrediction
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt') >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
""" """
outputs = self.bert(
@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
outputs = self.bert( outputs = self.bert(
...@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForMultipleChoice
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
...@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForTokenClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.bert( outputs = self.bert(
...@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
""" """
outputs = self.bert( outputs = self.bert(
......
@@ -31,6 +31,8 @@ from .modeling_roberta import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CamembertTokenizer"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",
...
@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CTRLTokenizer"
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
self.h[layer].multi_head_attention.prune_heads(heads)
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import CTRLTokenizer, CTRLModel
import torch
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
use_cache = use_cache if use_cache is not None else self.config.use_cache use_cache = use_cache if use_cache is not None else self.config.use_cache
...@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): ...@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
""" """
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
......
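This hunk shows the pattern the commit applies to every modeling file: the hand-written usage example is removed from each ``forward`` docstring, a module-level ``_TOKENIZER_FOR_DOC`` constant is added, and the method is decorated with ``add_code_sample_docstrings``, which appends a checkpoint-specific, doctest-style sample instead. The real helper lives in ``file_utils`` and is not shown in this diff; purely as a hypothetical sketch of the idea::

    def add_code_sample_docstrings(*, tokenizer_class, checkpoint):
        # Hypothetical sketch only -- the actual transformers.file_utils helper is richer.
        sample = """
        Example::

            >>> from transformers import {tokenizer_class}, {model_class}
            >>> import torch

            >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
            >>> model = {model_class}.from_pretrained('{checkpoint}')

            >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors='pt')
            >>> outputs = model(inputs)
        """

        def decorator(fn):
            model_class = fn.__qualname__.split(".")[0]  # e.g. "CTRLModel" for CTRLModel.forward
            fn.__doc__ = (fn.__doc__ or "") + sample.format(
                tokenizer_class=tokenizer_class, model_class=model_class, checkpoint=checkpoint
            )
            return fn

        return decorator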
...@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss ...@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu from .activations import gelu
from .configuration_distilbert import DistilBertConfig from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased", "distilbert-base-uncased",
...@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.transformer.layer[layer].attention.prune_heads(heads) self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel): ...@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertModel
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector return self.vocab_projector
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): ...@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ...@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
distilbert_output = self.distilbert( distilbert_output = self.distilbert(
input_ids=input_ids, input_ids=input_ids,
...@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): ...@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
""" """
distilbert_output = self.distilbert( distilbert_output = self.distilbert(
input_ids=input_ids, input_ids=input_ids,
...@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): ...@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.distilbert( outputs = self.distilbert(
...@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): ...@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
Examples:: Examples::
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
import torch >>> import torch
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
choice0 = "It is eaten with a fork and a knife." >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) >>> # the linear classifier still needs to be trained
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> loss, logits = outputs[:2]
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
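Rewriting the samples in ``>>>`` doctest syntax is what makes them mechanically checkable. The repository's actual test harness is not part of this hunk, but the standard-library ``doctest`` module is already enough to execute the examples found in an object's docstring; a hedged sketch::

    import doctest

    def run_docstring_samples(obj, globs=None):
        # Collect the >>> examples from obj's docstring and execute them,
        # returning (number of failed examples, number of examples tried).
        finder = doctest.DocTestFinder(verbose=False, recurse=False)
        runner = doctest.DocTestRunner(verbose=False)
        for test in finder.find(obj, globs=globs or {}):
            runner.run(test)
        return runner.failures, runner.tries

For instance, ``run_docstring_samples(DistilBertForMultipleChoice.forward)`` would execute the multiple-choice sample shown above (and download the checkpoint it references).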
...@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss ...@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import get_activation from .activations import get_activation
from .configuration_electra import ElectraConfig from .configuration_electra import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
from .modeling_utils import SequenceSummary from .modeling_utils import SequenceSummary
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator", "google/electra-small-generator",
...@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel): ...@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraModel, ElectraTokenizer
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): ...@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
input_ids, input_ids,
...@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel): ...@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
Examples:: Examples::
from transformers import ElectraTokenizer, ElectraForPreTraining >>> from transformers import ElectraTokenizer, ElectraForPreTraining
import torch >>> import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids) >>> scores = model(input_ids)[0]
prediction_scores, seq_relationship_scores = outputs[:2]
""" """
...@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
return self.generator_lm_head return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): ...@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMaskedLM
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
warnings.warn( warnings.warn(
...@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): ...@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForTokenClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
...@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): ...@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
""" """
discriminator_hidden_states = self.electra( discriminator_hidden_states = self.electra(
...@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): ...@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): ...@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMultipleChoice
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
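The updated ``ElectraForPreTraining`` sample keeps only the first output, which the discriminator head produces as one replaced-token logit per position (an assumption here: shape ``(batch_size, sequence_length)``). A hedged sketch turning those logits into per-token predictions::

    import torch
    from transformers import ElectraTokenizer, ElectraForPreTraining

    tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
    model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    scores = model(input_ids)[0]                        # assumed: per-token logits, shape (batch_size, sequence_length)
    predictions = (torch.sigmoid(scores) > 0.5).long()  # 1 = token flagged as replaced by the discriminator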
...@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel): ...@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):
Examples:: Examples::
from transformers import EncoderDecoder >>> from transformers import EncoderDecoderModel
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
""" """
kwargs_encoder = { kwargs_encoder = {
...@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel): ...@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):
Examples:: Examples::
from transformers import EncoderDecoderModel, BertTokenizer >>> from transformers import EncoderDecoderModel, BertTokenizer
import torch >>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
# forward >>> # forward
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
# training >>> # training
loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2] >>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]
# generation >>> # generation
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id) >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
""" """
......
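The generation call in the sample above returns token ids; decoding them with the same tokenizer turns them back into text. A one-line continuation of that sample (``generated`` and ``tokenizer`` come from the snippet above)::

    text = tokenizer.decode(generated[0], skip_special_tokens=True)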
...@@ -22,7 +22,7 @@ import torch ...@@ -22,7 +22,7 @@ import torch
from torch.nn import functional as F from torch.nn import functional as F
from .configuration_flaubert import FlaubertConfig from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_xlm import ( from .modeling_xlm import (
XLMForQuestionAnswering, XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple, XLMForQuestionAnsweringSimple,
...@@ -35,6 +35,8 @@ from .modeling_xlm import ( ...@@ -35,6 +35,8 @@ from .modeling_xlm import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased", "flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased", "flaubert/flaubert_base_uncased",
...@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel): ...@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
self.pre_norm = getattr(config, "pre_norm", False) self.pre_norm = getattr(config, "pre_norm", False)
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel): ...@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import FlaubertTokenizer, FlaubertModel
import torch
tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
model = FlaubertModel.from_pretrained('flaubert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
......
...@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss ...@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import ( from .modeling_utils import (
Conv1D, Conv1D,
PreTrainedModel, PreTrainedModel,
...@@ -38,6 +38,8 @@ from .modeling_utils import ( ...@@ -38,6 +38,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2", "gpt2",
"gpt2-medium", "gpt2-medium",
...@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
self.h[layer].attn.prune_heads(heads) self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel): ...@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import GPT2Tokenizer, GPT2Model
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states = (
...@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]} return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): ...@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
""" """
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
...@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Examples:: Examples::
import torch >>> import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') >>> # Add a [CLS] to the vocabulary (we should train it also!)
model = GPT2DoubleHeadsModel.from_pretrained('gpt2') >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
# Add a [CLS] to the vocabulary (we should train it also!) >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices] >>> encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids) >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2] >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if "lm_labels" in kwargs: if "lm_labels" in kwargs:
......
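In the ``GPT2DoubleHeadsModel`` sample, the multiple-choice head scores each candidate continuation, so the model's pick is the argmax over the choice dimension. A continuation of the sample above, assuming ``mc_prediction_scores`` has shape ``(batch_size, num_choices)``::

    predicted_choice = mc_prediction_scores.argmax(dim=-1)  # index of the preferred choice for each batch item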
...@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss ...@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F from torch.nn import functional as F
from .configuration_longformer import LongformerConfig from .configuration_longformer import LongformerConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertPreTrainedModel from .modeling_bert import BertPreTrainedModel
from .modeling_roberta import RobertaLMHead, RobertaModel from .modeling_roberta import RobertaLMHead, RobertaModel
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "LongformerTokenizer"
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"allenai/longformer-base-4096", "allenai/longformer-base-4096",
"allenai/longformer-large-4096", "allenai/longformer-large-4096",
...@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel): ...@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):
Examples:: Examples::
import torch >>> import torch
from transformers import LongformerModel, LongformerTokenizer >>> from transformers import LongformerModel, LongformerTokenizer
model = LongformerModel.from_pretrained('allenai/longformer-base-4096') >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example, >>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
# classification: the <s> token ... # classification: the <s> token
# QA: question tokens ... # QA: question tokens
# LM: potentially on the beginning of sentences and paragraphs ... # LM: potentially on the beginning of sentences and paragraphs
sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask) >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel): ...@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):
Examples:: Examples::
import torch >>> import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1 >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
# check ``LongformerModel.forward`` for more details how to set `attention_mask` ... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids) >>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
""" """
if "masked_lm_labels" in kwargs: if "masked_lm_labels" in kwargs:
...@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel): ...@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
""" """
if global_attention_mask is None: if global_attention_mask is None:
...@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): ...@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
Examples:: Examples::
from transformers import LongformerTokenizer, LongformerForQuestionAnswering >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch >>> import torch
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt") >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"] >>> input_ids = encoding["input_ids"]
# default is local attention everywhere >>> # default is local attention everywhere
# the forward method will automatically set global attention on question tokens >>> # the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"] >>> attention_mask = encoding["attention_mask"]
start_scores, end_scores = model(input_ids, attention_mask=attention_mask) >>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1] >>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
""" """
...@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel): ...@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForTokenClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
""" """
outputs = self.longformer( outputs = self.longformer(
...@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
self.init_weights() self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)")) @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward( def forward(
self, self,
input_ids=None, input_ids=None,
...@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel): ...@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
Examples::
from transformers import LongformerTokenizer, LongformerForMultipleChoice
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
# context = "The dog is cute" | choice = "the dog" / "the cat"
choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
# global attention is automatically put on "the dog" and "the cat"
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
""" """
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
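The ``LongformerModel`` sample above documents the attention-mask convention (0: no attention, 1: local attention, 2: global attention) and notes that classification typically puts global attention on the ``<s>`` token. A compact sketch of exactly that setup, reusing the calls shown above::

    import torch
    from transformers import LongformerModel, LongformerTokenizer

    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

    input_ids = torch.tensor(tokenizer.encode("Hello world!")).unsqueeze(0)  # batch of size 1
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)           # 1 = local attention everywhere
    attention_mask[:, 0] = 2                                                 # 2 = global attention on the first (<s>) token
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)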
...@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration): ...@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):
Examples:: Examples::
from transformers import MarianTokenizer, MarianMTModel >>> from transformers import MarianTokenizer, MarianMTModel
from typing import List >>> from typing import List
src = 'fr' # source language >>> src = 'fr' # source language
trg = 'en' # target language >>> trg = 'en' # target language
sample_text = "où est l'arrêt de bus ?" >>> sample_text = "où est l'arrêt de bus ?"
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
model = MarianMTModel.from_pretrained(mname) >>> model = MarianMTModel.from_pretrained(mname)
tok = MarianTokenizer.from_pretrained(mname) >>> tok = MarianTokenizer.from_pretrained(mname)
batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference >>> batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
gen = model.generate(**batch) # for forward pass: model(**batch) >>> gen = model.generate(**batch) # for forward pass: model(**batch)
words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?" >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
""" """
......
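The same three calls (``prepare_translation_batch``, ``generate``, ``batch_decode``) also handle several source sentences at once; a short continuation of the Marian sample above (``tok`` and ``model`` come from that snippet)::

    src_texts = ["où est l'arrêt de bus ?", "merci beaucoup !"]
    batch = tok.prepare_translation_batch(src_texts=src_texts)
    translations = tok.batch_decode(model.generate(**batch), skip_special_tokens=True)  # one English sentence per input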