Unverified commit 364a5ae1, authored by Lysandre Debut and committed by GitHub

Refactor Code samples; Test code samples (#5036)



* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 315f464b
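
The examples touched throughout this commit are rewritten in the interactive ">>>" (doctest) format so they can be executed and checked automatically instead of drifting out of date silently. As a self-contained sketch of the underlying mechanism (not part of this diff; the add function below is an illustrative placeholder), Python's standard doctest module runs every ">>>" example found in a module's docstrings and compares the printed output:

    import doctest

    def add(a, b):
        """Add two integers.

        Example::

            >>> add(1, 2)
            3
        """
        return a + b

    if __name__ == "__main__":
        # Collect and run every ">>>" example in this module's docstrings.
        results = doctest.testmod(verbose=False)
        print(f"attempted={results.attempted} failed={results.failed}")
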
......@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):
Example::
from transformers import ReformerModel, ReformerConfig
>>> from transformers import ReformerModel, ReformerConfig
# Initializing a Reformer configuration
configuration = ReformerConfig()
>>> # Initializing a Reformer configuration
>>> configuration = ReformerConfig()
# Initializing a Reformer model
model = ReformerModel(configuration)
>>> # Initializing a Reformer model
>>> model = ReformerModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "reformer"
......
......@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):
Example::
from transformers import RobertaConfig, RobertaModel
>>> from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
configuration = RobertaConfig()
>>> # Initializing a RoBERTa configuration
>>> configuration = RobertaConfig()
# Initializing a model from the configuration
model = RobertaModel(configuration)
>>> # Initializing a model from the configuration
>>> model = RobertaModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "roberta"
......
......@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):
Example::
from transformers import TransfoXLConfig, TransfoXLModel
>>> from transformers import TransfoXLConfig, TransfoXLModel
# Initializing a Transformer XL configuration
configuration = TransfoXLConfig()
>>> # Initializing a Transformer XL configuration
>>> configuration = TransfoXLConfig()
# Initializing a model from the configuration
model = TransfoXLModel(configuration)
>>> # Initializing a model from the configuration
>>> model = TransfoXLModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "transfo-xl"
......
......@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):
Example::
from transformers import XLMConfig, XLMModel
>>> from transformers import XLMConfig, XLMModel
# Initializing a XLM configuration
configuration = XLMConfig()
>>> # Initializing a XLM configuration
>>> configuration = XLMConfig()
# Initializing a model from the configuration
model = XLMModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLMModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlm"
......
......@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):
Example::
from transformers import XLNetConfig, XLNetModel
>>> from transformers import XLNetConfig, XLNetModel
# Initializing a XLNet configuration
configuration = XLNetConfig()
>>> # Initializing a XLNet configuration
>>> configuration = XLNetConfig()
# Initializing a model from the configuration
model = XLNetModel(configuration)
>>> # Initializing a model from the configuration
>>> model = XLNetModel(configuration)
# Accessing the model configuration
configuration = model.config
>>> # Accessing the model configuration
>>> configuration = model.config
"""
model_type = "xlnet"
......
......@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):
Examples::
import tensorflow_datasets as tfds
dataset = tfds.load("squad")
>>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad")
training_examples = get_examples_from_dataset(dataset, evaluate=False)
evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
>>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:
......
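
For context, get_examples_from_dataset is an instance method of SquadProcessor, so in practice the example above runs on a concrete processor. A minimal sketch in the same doctest style, assuming the SquadV1Processor class exported by transformers and a working tensorflow_datasets installation (neither is introduced by this diff):

    >>> import tensorflow_datasets as tfds
    >>> from transformers import SquadV1Processor

    >>> processor = SquadV1Processor()
    >>> dataset = tfds.load("squad")  # dict with "train" and "validation" splits
    >>> training_examples = processor.get_examples_from_dataset(dataset, evaluate=False)
    >>> evaluation_examples = processor.get_examples_from_dataset(dataset, evaluate=True)
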
......@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
return docstring_decorator
PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, scores = outputs[:2]
"""
PT_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
>>> loss, start_scores, end_scores = outputs[:3]
"""
PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels)
>>> loss, logits = outputs[:2]
"""
PT_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
>>> outputs = model(input_ids, labels=input_ids)
>>> loss, prediction_scores = outputs[:2]
"""
PT_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
PT_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import torch
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
"""
PT_CAUSAL_LM_SAMPLE = r"""
Example::
>>> import torch
>>> from transformers import {tokenizer_class}, {model_class}
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"])
>>> loss, logits = outputs[:2]
"""
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
>>> outputs = model(inputs)
>>> loss, scores = outputs[:2]
"""
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf')
>>> start_scores, end_scores = model(input_dict)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
"""
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs)
>>> loss, logits = outputs[:2]
"""
TF_MASKED_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores = outputs[0]
"""
TF_BASE_MODEL_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs[0]
"""
TF_CAUSAL_LM_SAMPLE = r"""
Example::
>>> from transformers import {tokenizer_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
>>> model = {model_class}.from_pretrained('{checkpoint}')
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs[0]
"""
def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
def docstring_decorator(fn):
model_class = fn.__qualname__.split(".")[0]
is_tf_class = model_class[:2] == "TF"
if "SequenceClassification" in model_class:
code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
elif "QuestionAnswering" in model_class:
code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
elif "TokenClassification" in model_class:
code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
elif "MultipleChoice" in model_class:
code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
elif "MaskedLM" in model_class:
code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
elif "LMHead" in model_class:
code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
elif "Model" in model_class:
code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
else:
raise ValueError(f"Docstring can't be built for model {model_class}")
built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
return fn
return docstring_decorator
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https")
......
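
The new add_code_sample_docstrings decorator above picks a PyTorch or TensorFlow template from the wrapped method's class name (a "TF" prefix selects the TensorFlow variant; the task suffix SequenceClassification, QuestionAnswering, TokenClassification, MultipleChoice, MaskedLM, LMHead or Model selects the sample), fills in the tokenizer class and checkpoint, and appends the result to the method's docstring. A minimal sketch of how it is applied, assuming a transformers checkout containing this commit; MyModelForMaskedLM and its checkpoint are illustrative placeholders, while the real call sites added below look like the AlbertModel.forward decoration:

    from transformers.file_utils import add_code_sample_docstrings

    _TOKENIZER_FOR_DOC = "BertTokenizer"


    class MyModelForMaskedLM:
        @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
        def forward(self, input_ids=None):
            r"""Existing argument documentation stays in place."""


    # "MyModelForMaskedLM" does not start with "TF" and contains "MaskedLM", so the decorator
    # appends PT_MASKED_LM_SAMPLE formatted with tokenizer_class="BertTokenizer" and
    # checkpoint="bert-base-uncased" to forward.__doc__.
    print(MyModelForMaskedLM.forward.__doc__)
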
......@@ -24,13 +24,15 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from .configuration_albert import AlbertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AlbertTokenizer"
ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"albert-base-v1",
......@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertModel, AlbertTokenizer
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
Examples::
from transformers import AlbertTokenizer, AlbertForPreTraining
import torch
>>> from transformers import AlbertTokenizer, AlbertForPreTraining
>>> import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForPreTraining.from_pretrained('albert-base-v2')
>>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
>>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
prediction_scores, sop_scores = outputs[:2]
>>> prediction_scores, sop_scores = outputs[:2]
"""
......@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
return self.predictions.decoder
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Example::
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.albert(
......@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForTokenClassification
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.albert(
......@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
start_scores, end_scores = model(**input_dict)
"""
outputs = self.albert(
......@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
def forward(
self,
input_ids=None,
......@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import AlbertTokenizer, AlbertForMultipleChoice
import torch
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
......@@ -392,8 +392,8 @@ class AutoModel:
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_MAPPING.items():
if isinstance(config, config_class):
......@@ -480,8 +480,7 @@ class AutoModel:
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
assert model.config.output_attention == True
assert model.config.output_attentions == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
......@@ -547,8 +546,8 @@ class AutoModelForPreTraining:
Examples::
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
>>> model = AutoModelForPreTraining.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')`
"""
for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
if isinstance(config, config_class):
......
......@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_bart import BartConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_callable,
)
from .modeling_utils import PreTrainedModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BartTokenizer"
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/bart-large",
......@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""
"""
BART_GENERATION_EXAMPLE = r"""
Examples::
Summarization example::
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# see ``examples/summarization/bart/run_eval.py`` for a longer example
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
......@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
self.init_weights()
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
......@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):
@add_start_docstrings(
"The BART Model with a language modeling head. Can be used for summarization.",
BART_START_DOCSTRING + BART_GENERATION_EXAMPLE,
"The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
)
class BartForConditionalGeneration(PretrainedBartModel):
base_model_prefix = "model"
......@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
self.register_buffer("final_logits_bias", new_bias)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def forward(
self,
input_ids,
......@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
Conditional generation example::
# Mask filling only works for bart-large
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained('bart-large')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
TXT = "My friends are <mask> but they eat too many carbs."
model = BartForConditionalGeneration.from_pretrained('bart-large')
input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids']
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
logits = model(input_ids)[0]
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very']
"""
......@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
self.model._init_weights(self.classification_head.out_proj)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
......@@ -1092,28 +1107,15 @@ class BartForSequenceClassification(PretrainedBartModel):
Classification loss (cross entropy)
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
Attentions weights after the attention softmax, used to compute the weighted average in the
self-attention
heads.
Examples::
from transformers import BartTokenizer, BartForSequenceClassification
import torch
tokenizer = BartTokenizer.from_pretrained('bart-large')
model = BartForSequenceClassification.from_pretrained('bart-large')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if labels is not None:
use_cache = False
......@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
self.model._init_weights(self.qa_outputs)
@add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
def forward(
self,
input_ids,
......@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
# The checkpoint bart-large is not fine-tuned for question answering. Please see the
# examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
from transformers import BartTokenizer, BartForQuestionAnswering
import torch
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
start_scores, end_scores = model(torch.tensor([input_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
if start_positions is not None and end_positions is not None:
use_cache = False
......@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
return outputs # return outputs # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)
class SinusoidalPositionalEmbedding(nn.Embedding):
......
......@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import gelu, gelu_new, swish
from .configuration_bert import BertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "BertTokenizer"
BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"bert-base-uncased",
"bert-large-uncased",
......@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertModel, BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):
Examples::
from transformers import BertTokenizer, BertForPreTraining
import torch
>>> from transformers import BertTokenizer, BertForPreTraining
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased')
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> prediction_scores, seq_relationship_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
......@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertLMHeadModel
import torch
Example::
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertLMHeadModel.from_pretrained('bert-base-uncased', is_decoder=True)
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
>>> import torch
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
>>> config = BertConfig.from_pretrained("bert-base-cased")
>>> config.is_decoder = True
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
loss, prediction_scores = outputs[:2]
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.bert(
......@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
return self.cls.predictions.decoder
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMaskedLM
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
Examples::
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
>>> from transformers import BertTokenizer, BertForNextSentencePrediction
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
assert logits[0, 0] < logits[0, 1] # next sentence was random
>>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
"""
outputs = self.bert(
......@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
outputs = self.bert(
......@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForMultipleChoice
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForTokenClassification
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.bert(
......@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
"""
outputs = self.bert(
......
......@@ -31,6 +31,8 @@ from .modeling_roberta import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CamembertTokenizer"
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"camembert-base",
"Musixmatch/umberto-commoncrawl-cased-v1",
......
......@@ -24,12 +24,14 @@ import torch.nn as nn
from torch.nn import CrossEntropyLoss
from .configuration_ctrl import CTRLConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "CTRLTokenizer"
CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
"ctrl"
# See all CTRL models at https://huggingface.co/models?filter=ctrl
......@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
self.h[layer].multi_head_attention.prune_heads(heads)
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
......@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import CTRLTokenizer, CTRLModel
import torch
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
use_cache = use_cache if use_cache is not None else self.config.use_cache
......@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
def forward(
self,
input_ids=None,
......@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl')
input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......
......@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss
from .activations import gelu
from .configuration_distilbert import DistilBertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"distilbert-base-uncased",
......@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
self.transformer.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertModel
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertModel.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
......@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
distilbert_output = self.distilbert(
input_ids=input_ids,
......@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
def forward(
self,
input_ids=None,
......@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.distilbert(
......@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
Examples::
from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
import torch
>>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
>>> import torch
>>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
>>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
>>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> loss, logits = outputs[:2]
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
......@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
from .activations import get_activation
from .configuration_electra import ElectraConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
from .modeling_utils import SequenceSummary
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "ElectraTokenizer"
ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
......@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
......@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraModel, ElectraTokenizer
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
......@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
discriminator_hidden_states = self.electra(
input_ids,
......@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
Examples::
from transformers import ElectraTokenizer, ElectraForPreTraining
import torch
>>> from transformers import ElectraTokenizer, ElectraForPreTraining
>>> import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
>>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
>>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
prediction_scores, seq_relationship_scores = outputs[:2]
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> scores = model(input_ids)[0]
"""
......@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
def forward(
self,
input_ids=None,
......@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMaskedLM
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
if "masked_lm_labels" in kwargs:
warnings.warn(
......@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
......@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForTokenClassification
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
discriminator_hidden_states = self.electra(
......@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
......@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
"""
discriminator_hidden_states = self.electra(
......@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
def forward(
self,
input_ids=None,
......@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import ElectraTokenizer, ElectraForMultipleChoice
import torch
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
# the linear classifier still needs to be trained
loss, logits = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
......@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):
Examples::
from transformers import EncoderDecoder
model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
>>> from transformers import EncoderDecoderModel
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
"""
kwargs_encoder = {
......@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):
Examples::
from transformers import EncoderDecoderModel, BertTokenizer
import torch
>>> from transformers import EncoderDecoderModel, BertTokenizer
>>> import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
# forward
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
>>> # forward
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
# training
loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2]
>>> # training
>>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]
# generation
generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
>>> # generation
>>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
"""
......
......@@ -22,7 +22,7 @@ import torch
from torch.nn import functional as F
from .configuration_flaubert import FlaubertConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_xlm import (
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
......@@ -35,6 +35,8 @@ from .modeling_xlm import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"flaubert/flaubert_small_cased",
"flaubert/flaubert_base_uncased",
......@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
self.pre_norm = getattr(config, "pre_norm", False)
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
def forward(
self,
input_ids=None,
......@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import FlaubertTokenizer, FlaubertModel
import torch
tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
model = FlaubertModel.from_pretrained('flaubert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......
......@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss
from .activations import ACT2FN
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_utils import (
Conv1D,
PreTrainedModel,
......@@ -38,6 +38,8 @@ from .modeling_utils import (
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"gpt2",
"gpt2-medium",
......@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
self.h[layer].attn.prune_heads(heads)
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward(
self,
input_ids=None,
......@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import GPT2Tokenizer, GPT2Model
import torch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
......@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
@add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
def forward(
self,
input_ids=None,
......@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]
"""
transformer_outputs = self.transformer(
input_ids,
......@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Examples::
import torch
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> import torch
>>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
>>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
>>> # Add a [CLS] to the vocabulary (we should train it also!)
>>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
# Add a [CLS] to the vocabulary (we should train it also!)
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary
>>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
>>> encoded_choices = [tokenizer.encode(s) for s in choices]
>>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
>>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2
>>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
>>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
if "lm_labels" in kwargs:
......
......@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn import functional as F
from .configuration_longformer import LongformerConfig
from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
from .modeling_bert import BertPreTrainedModel
from .modeling_roberta import RobertaLMHead, RobertaModel
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "LongformerTokenizer"
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
......@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):
Examples::
import torch
from transformers import LongformerModel, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerModel, LongformerTokenizer
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
# classification: the <s> token
# QA: question tokens
# LM: potentially on the beginning of sentences and paragraphs
sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
>>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
>>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
>>> attention_mask[:, [1, 4, 21,]] = 2 # Set global attention based on the task. For example,
... # classification: the <s> token
... # QA: question tokens
... # LM: potentially on the beginning of sentences and paragraphs
>>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
......@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):
Examples::
import torch
from transformers import LongformerForMaskedLM, LongformerTokenizer
>>> import torch
>>> from transformers import LongformerForMaskedLM, LongformerTokenizer
model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
>>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
>>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
>>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document
>>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0) # batch of size 1
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
# check ``LongformerModel.forward`` for more details how to set `attention_mask`
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
>>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
... # check ``LongformerModel.forward`` for more details how to set `attention_mask`
>>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
"""
if "masked_lm_labels" in kwargs:
......@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
......@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
if global_attention_mask is None:
......@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
Examples::
from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch
>>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
>>> import torch
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
>>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
>>> input_ids = encoding["input_ids"]
# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]
>>> # default is local attention everywhere
>>> # the forward method will automatically set global attention on question tokens
>>> attention_mask = encoding["attention_mask"]
start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
>>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
>>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
>>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
"""
......@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
......@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForTokenClassification
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, scores = outputs[:2]
"""
outputs = self.longformer(
......@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
self.init_weights()
@add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
@add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
def forward(
self,
input_ids=None,
......@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
Examples::
from transformers import LongformerTokenizer, LongformerForMultipleChoice
import torch
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
# context = "The dog is cute" | choice = "the dog" / "the cat"
choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
# global attention is automatically put on "the dog" and "the cat"
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
......
......@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):
Examples::
from transformers import MarianTokenizer, MarianMTModel
from typing import List
src = 'fr' # source language
trg = 'en' # target language
sample_text = "où est l'arrêt de bus ?"
mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
model = MarianMTModel.from_pretrained(mname)
tok = MarianTokenizer.from_pretrained(mname)
batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
gen = model.generate(**batch) # for forward pass: model(**batch)
words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
>>> from transformers import MarianTokenizer, MarianMTModel
>>> from typing import List
>>> src = 'fr' # source language
>>> trg = 'en' # target language
>>> sample_text = "où est l'arrêt de bus ?"
>>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
>>> model = MarianMTModel.from_pretrained(mname)
>>> tok = MarianTokenizer.from_pretrained(mname)
>>> batch = tok.prepare_translation_batch(src_texts=[sample_text]) # don't need tgt_text for inference
>>> gen = model.generate(**batch) # for forward pass: model(**batch)
>>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the the bus stop ?"
"""
......