Unverified commit b5e2b183 authored by Sylvain Gugger, committed by GitHub

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
parent e13f72fb
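For context on what the bullets above describe, the snippet below is a minimal, illustrative sketch of the idea behind a docstring styler of this kind: pull the ```python blocks out of a docstring or MDX source, strip the doctest prompts, run black on the bare code, then put the prompts back (`>>> ` for top-level statements, `... ` for continuation lines), leaving empty lines untouched. This is not the actual `utils/style_doc.py` from this PR; the names `blackify_example` and `style_docstring` are made up for the example, and it assumes the examples contain only prompted lines (no expected-output lines).

```python
# Illustrative sketch only -- NOT the utils/style_doc.py shipped in this PR.
import re

import black

_CODE_BLOCK_RE = re.compile(r"```python\n(.*?)```", re.DOTALL)


def blackify_example(code: str, line_length: int = 119) -> str:
    """Re-format one doctest example with black and restore the prompts."""
    # Strip the `>>> ` / `... ` prompts, keeping the code's own indentation
    # (assumes the block has no expected-output lines).
    bare_lines = []
    for line in code.splitlines():
        prompt_free = line.strip()
        if prompt_free.startswith((">>>", "...")):
            prompt_free = prompt_free[4:] if len(prompt_free) > 3 else ""
        bare_lines.append(prompt_free)

    try:
        formatted = black.format_str("\n".join(bare_lines), mode=black.Mode(line_length=line_length))
    except black.InvalidInput:
        return code  # leave examples that are not valid Python untouched

    # Put the prompts back: a new top-level statement gets `>>> `, anything that
    # continues an open bracket or an indented block gets `... `.
    styled, depth = [], 0
    for line in formatted.splitlines():
        if not line.strip():
            styled.append("")  # don't add prompts (or trailing spaces) to empty lines
            continue
        prompt = "... " if depth > 0 or line.startswith(" ") else ">>> "
        styled.append(prompt + line)
        # Rough bracket counter; good enough for these examples, not for brackets inside strings.
        depth += sum(line.count(c) for c in "([{") - sum(line.count(c) for c in ")]}")
    return "\n".join(styled) + "\n"


def style_docstring(docstring: str) -> str:
    """Apply black to every ```python block in a docstring or MDX source string."""
    return _CODE_BLOCK_RE.sub(lambda m: "```python\n" + blackify_example(m.group(1)) + "```", docstring)
```

A real implementation additionally has to handle MDX files, triple quotes inside docstrings, and the first nonempty line of a block, which is what several of the commit-message bullets above refer to.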
@@ -58,17 +58,17 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> config_decoder = model.config.decoder
>>> # set decoder config to causal lm
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> model.save_pretrained("my-model")
>>> # loading model and config from pretrained folder
>>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
>>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
>>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model")
>>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
```"""
model_type = "vision-encoder-decoder"
is_composition = True
......
@@ -397,13 +397,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
>>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
>>> encoder_outputs = model.encode(pixel_values)
@@ -474,13 +474,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
>>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
>>> encoder_outputs = model.encode(pixel_values)
@@ -601,16 +601,16 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> # load output tokenizer
>>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
>>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
>>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
>>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
@@ -746,8 +746,11 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
```python
>>> from transformers import FlaxVisionEncoderDecoderModel
>>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'gpt2')
>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "google/vit-base-patch16-224-in21k", "gpt2"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./vit-gpt2")
>>> # load fine-tuned model
......
@@ -302,8 +302,11 @@ class VisionEncoderDecoderModel(PreTrainedModel):
```python
>>> from transformers import VisionEncoderDecoderModel
>>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'bert-base-uncased')
>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
... "google/vit-base-patch16-224-in21k", "bert-base-uncased"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./vit-bert")
>>> # load fine-tuned model
@@ -417,8 +420,8 @@ class VisionEncoderDecoderModel(PreTrainedModel):
>>> from PIL import Image
>>> import torch
>>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
>>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
>>> # load image from the IAM dataset
>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
......
@@ -61,15 +61,15 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
>>> model = VisionTextDualEncoderModel(config=config)
>>> # Accessing the model configuration
>>> config_vision = model.config.vision_config
>>> config_vision = model.config.vision_config
>>> config_text = model.config.text_config
>>> # Saving the model, including its configuration
>>> model.save_pretrained('my-model')
>>> model.save_pretrained("my-model")
>>> # loading model and config from pretrained folder
>>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
>>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
>>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained("vit-bert")
>>> model = VisionTextDualEncoderModel.from_pretrained("vit-bert", config=vision_text_config)
```"""
model_type = "vision-text-dual-encoder"
......
@@ -446,12 +446,15 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
```python
>>> from transformers import FlaxVisionTextDualEncoderModel
>>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
... "bert-base-uncased", "google/vit-base-patch16-224"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./vit-bert")
>>> # load fine-tuned model
>>> model = FlaxVisionTextDualEncoderModel.from_pretrained("./vit-bert")
>>> model = FlaxVisionTextDualEncoderModel.from_pretrained("./vit-bert")
```"""
kwargs_vision = {
@@ -531,19 +534,36 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
>>> from PIL import Image
>>> import requests
>>> import jax
>>> from transformers import FlaxVisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
>>> from transformers import (
... FlaxVisionTextDualEncoderModel,
... VisionTextDualEncoderProcessor,
... ViTFeatureExtractor,
... BertTokenizer,
... )
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
... "google/vit-base-patch16-224", "bert-base-uncased"
... )
>>> # contrastive training
>>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
>>> urls = [
... "http://images.cocodataset.org/val2017/000000039769.jpg",
... "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
... ]
>>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True)
>>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
>>> loss, logits_per_image = outputs.loss, outputs.logits_per_image # this is the image-text similarity score
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True
... )
>>> outputs = model(
... input_ids=inputs.input_ids,
... attention_mask=inputs.attention_mask,
... pixel_values=inputs.pixel_values,
... return_loss=True,
... )
>>> loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score
>>> # save and load from pretrained
>>> model.save_pretrained("vit-bert")
@@ -551,8 +571,8 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
>>> # inference
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
```
"""
......
@@ -231,7 +231,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
>>> model = VisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
>>> tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")
>>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="pt")
>>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```"""
text_outputs = self.text_model(
@@ -312,19 +312,36 @@ class VisionTextDualEncoderModel(PreTrainedModel):
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
>>> from transformers import (
... VisionTextDualEncoderModel,
... VisionTextDualEncoderProcessor,
... ViTFeatureExtractor,
... BertTokenizer,
... )
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
... "google/vit-base-patch16-224", "bert-base-uncased"
... )
>>> # contrastive training
>>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
>>> urls = [
... "http://images.cocodataset.org/val2017/000000039769.jpg",
... "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
... ]
>>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)
>>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
>>> loss, logits_per_image = outputs.loss, outputs.logits_per_image # this is the image-text similarity score
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True
... )
>>> outputs = model(
... input_ids=inputs.input_ids,
... attention_mask=inputs.attention_mask,
... pixel_values=inputs.pixel_values,
... return_loss=True,
... )
>>> loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score
>>> # save and load from pretrained
>>> model.save_pretrained("vit-bert")
@@ -332,8 +349,8 @@ class VisionTextDualEncoderModel(PreTrainedModel):
>>> # inference
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
```"""
return_dict = return_dict if return_dict is not None else self.config.return_dict
@@ -447,8 +464,11 @@ class VisionTextDualEncoderModel(PreTrainedModel):
```python
>>> from transformers import VisionTextDualEncoderModel
>>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
... "bert-base-uncased", "google/vit-base-patch16-224"
... )
>>> # saving model after fine-tuning
>>> model.save_pretrained("./vit-bert")
>>> # load fine-tuned model
......
@@ -93,7 +93,7 @@ class VisualBertConfig(PretrainedConfig):
>>> from transformers import VisualBertModel, VisualBertConfig
>>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
>>> configuration = VisualBertConfig.from_pretrained('visualbert-vqa-coco-pre')
>>> configuration = VisualBertConfig.from_pretrained("visualbert-vqa-coco-pre")
>>> # Initializing a model from the visualbert-vqa-coco-pre style configuration
>>> model = VisualBertModel(configuration)
......
@@ -745,19 +745,21 @@ class VisualBertModel(VisualBertPreTrainedModel):
from transformers import BertTokenizer, VisualBertModel
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
visual_embeds = get_visual_embeddings(image).unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update({
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})
inputs.update(
{
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask,
}
)
outputs = model(**inputs)
@@ -927,22 +929,26 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
# Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
from transformers import BertTokenizer, VisualBertForPreTraining
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
visual_embeds = get_visual_embeddings(image).unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update({
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})
max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
inputs.update(
{
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask,
}
)
max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
labels = tokenizer(
"The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length
)["input_ids"]
sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
@@ -1063,8 +1069,8 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
from transformers import BertTokenizer, VisualBertForMultipleChoice
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertForMultipleChoice.from_pretrained('uclanlp/visualbert-vcr')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
@@ -1078,15 +1084,17 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
# batch size is 1
inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
inputs_dict.update({
"visual_embeds": visual_embeds,
"visual_attention_mask": visual_attention_mask,
"visual_token_type_ids": visual_token_type_ids,
"labels": labels
})
inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
inputs_dict.update(
{
"visual_embeds": visual_embeds,
"visual_attention_mask": visual_attention_mask,
"visual_token_type_ids": visual_token_type_ids,
"labels": labels,
}
)
outputs = model(**inputs_dict)
loss = outputs.loss
@@ -1212,22 +1220,24 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
from transformers import BertTokenizer, VisualBertForQuestionAnswering
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
text = "Who is eating the apple?"
inputs = tokenizer(text, return_tensors='pt')
inputs = tokenizer(text, return_tensors="pt")
visual_embeds = get_visual_embeddings(image).unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update({
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})
inputs.update(
{
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask,
}
)
labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
outputs = model(**inputs, labels=labels)
loss = outputs.loss
@@ -1336,20 +1346,22 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
from transformers import BertTokenizer, VisualBertForVisualReasoning
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2")
text = "Who is eating the apple?"
inputs = tokenizer(text, return_tensors='pt')
inputs = tokenizer(text, return_tensors="pt")
visual_embeds = get_visual_embeddings(image).unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update({
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})
inputs.update(
{
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask,
}
)
labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2
@@ -1498,24 +1510,28 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment
import torch
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = VisualBertForRegionToPhraseAlignment.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
text = "Who is eating the apple?"
inputs = tokenizer(text, return_tensors='pt')
inputs = tokenizer(text, return_tensors="pt")
visual_embeds = get_visual_embeddings(image).unsqueeze(0)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
inputs.update({
"region_to_phrase_position": region_to_phrase_position,
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask
})
region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]))
inputs.update(
{
"region_to_phrase_position": region_to_phrase_position,
"visual_embeds": visual_embeds,
"visual_token_type_ids": visual_token_type_ids,
"visual_attention_mask": visual_attention_mask,
}
)
labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
labels = torch.ones(
(1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2])
) # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
......
@@ -521,11 +521,11 @@ FLAX_VISION_MODEL_DOCSTRING = """
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> model = FlaxViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
>>> inputs = feature_extractor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
@@ -603,11 +603,11 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
>>> import jax
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
>>> model = FlaxViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
>>> inputs = feature_extractor(images=image, return_tensors="np")
>>> outputs = model(**inputs)
......
@@ -682,11 +682,11 @@ class TFViTModel(TFViTPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
>>> inputs = feature_extractor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
@@ -803,11 +803,11 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
>>> model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
>>> inputs = feature_extractor(images=image, return_tensors="tf")
>>> outputs = model(**inputs)
......
@@ -522,11 +522,11 @@ class ViTModel(ViTPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
>>> model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
>>> model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
@@ -634,11 +634,11 @@ class ViTForImageClassification(ViTPreTrainedModel):
>>> from PIL import Image
>>> import requests
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
>>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
>>> model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
>>> inputs = feature_extractor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
......
@@ -952,15 +952,19 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
>>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
>>> def map_to_array(batch):
>>> speech, _ = sf.read(batch["file"])
>>> batch["speech"] = speech
>>> return batch
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values # Batch size 1
>>> input_values = processor(
... ds["speech"][0], sampling_rate=16_000, return_tensors="np"
...     ).input_values  # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state
```
"""
@@ -1055,15 +1059,19 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
>>> def map_to_array(batch):
>>> speech, _ = sf.read(batch["file"])
>>> batch["speech"] = speech
>>> return batch
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values # Batch size 1
>>> input_values = processor(
... ds["speech"][0], sampling_rate=16_000, return_tensors="np"
...     ).input_values  # Batch size 1
>>> logits = model(input_values).logits
>>> predicted_ids = jnp.argmax(logits, axis=-1)
@@ -1264,9 +1272,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
>>> outputs = model(input_values, mask_time_indices=mask_time_indices)
>>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
>>> cosine_sim = optax.cosine_similarity(
... outputs.projected_states, outputs.projected_quantized_states
... )
>>> cosine_sim = optax.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states)
>>> # show that cosine similarity is much higher than random
>>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
......
@@ -1408,10 +1408,12 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
>>> speech, _ = sf.read(batch["file"])
>>> batch["speech"] = speech
>>> return batch
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
@@ -1519,15 +1521,17 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
>>> speech, _ = sf.read(batch["file"])
>>> batch["speech"] = speech
>>> return batch
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
>>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
>>> logits = model(input_values).logits
>>> predicted_ids = tf.argmax(logits, axis=-1)
@@ -1538,7 +1542,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
>>> # wrap processor as target processor to encode labels
>>> with processor.as_target_processor():
>>> labels = processor(transcription, return_tensors="tf").input_ids
... labels = processor(transcription, return_tensors="tf").input_ids
>>> loss = model(input_values, labels=labels).loss
```"""
......
@@ -1421,9 +1421,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
... outputs = model(input_values, mask_time_indices=mask_time_indices)
>>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
>>> cosine_sim = torch.cosine_similarity(
... outputs.projected_states, outputs.projected_quantized_states, dim=-1
... )
>>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
>>> # show that cosine similarity is much higher than random
>>> assert cosine_sim[mask_time_indices].mean() > 0.5
@@ -1568,10 +1566,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
>>> speech, _ = sf.read(batch["file"])
>>> batch["speech"] = speech
>>> return batch
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
......
@@ -298,11 +298,11 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
......
@@ -370,11 +370,11 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
```python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
print("We have added", num_added_toks, "tokens")
# Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
```"""
......
@@ -1042,10 +1042,12 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
>>> from transformers import XLMTokenizer, XLMForQuestionAnswering
>>> import torch
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
>>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
>>> tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
>>> model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-en-2048")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
... 0
... )  # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
......
@@ -46,8 +46,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetEncoder
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone')
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
@@ -69,8 +69,10 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetDecoder
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False)
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetDecoder.from_pretrained(
... "patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False
... )
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
@@ -91,10 +93,12 @@ class XLMProphetNetModel(ProphetNetModel):
```python
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetModel
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetModel.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetModel.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> input_ids = tokenizer(
... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
@@ -115,10 +119,12 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
```python
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForConditionalGeneration
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
>>> input_ids = tokenizer(
... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
@@ -140,8 +146,8 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
>>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = XLMProphetNetForCausalLM.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetForCausalLM.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
@@ -152,14 +158,16 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
>>> from transformers import EncoderDecoderModel, XLMProphetNetTokenizer, XLMRobertaTokenizer
>>> import torch
>>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
>>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-large", 'microsoft/xprophetnet-large-wiki100-cased')
>>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
>>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
... "xlm-roberta-large", "microsoft/xprophetnet-large-wiki100-cased"
... )
>>> ARTICLE = (
... "the us state department said wednesday it had received no "
... "formal word from bolivia that it was expelling the us ambassador there "
... "but said the charges made against him are `` baseless ."
... "the us state department said wednesday it had received no "
... "formal word from bolivia that it was expelling the us ambassador there "
... "but said the charges made against him are `` baseless ."
... )
>>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
>>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
......
@@ -1321,21 +1321,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
>>> import numpy as np
>>> from transformers import XLNetTokenizer, TFXLNetLMHeadModel
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>> model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
>>> model = TFXLNetLMHeadModel.from_pretrained("xlnet-large-cased")
>>> # We show how to setup inputs to predict a next token using a bi-directional context.
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :] # We will predict the masked token
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[
... None, :
... ]  # We will predict the masked token
>>> perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
>>> perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
>>> target_mapping = np.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
>>> next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
>>> target_mapping = np.zeros(
... (1, 1, input_ids.shape[1])
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
... 0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(
... input_ids,
... perm_mask=tf.constant(perm_mask, dtype=tf.float32),
... target_mapping=tf.constant(target_mapping, dtype=tf.float32),
... )
>>> next_token_logits = outputs[
... 0
... ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
```"""
inputs = input_processing(
func=self.call,
......
@@ -1400,31 +1400,53 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
>>> from transformers import XLNetTokenizer, XLNetLMHeadModel
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
>>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
>>> model = XLNetLMHeadModel.from_pretrained("xlnet-large-cased")
>>> # We show how to setup inputs to predict a next token using a bi-directional context.
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
>>> input_ids = torch.tensor(
... tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
... ).unsqueeze(
... 0
... )  # We will predict the masked token
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
>>> perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
>>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> target_mapping = torch.zeros(
... (1, 1, input_ids.shape[1]), dtype=torch.float
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
... 0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
>>> next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
>>> next_token_logits = outputs[
... 0
... ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
>>> # In the same way, XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token
>>> input_ids = torch.tensor(
... tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
... ).unsqueeze(
... 0
... )  # We will predict the masked token
>>> labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
>>> assert labels.shape[0] == 1, 'only one word will be predicted'
>>> assert labels.shape[0] == 1, "only one word will be predicted"
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
>>> perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token as is done in standard auto-regressive lm training
>>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> perm_mask[
... :, :, -1
... ] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
>>> target_mapping = torch.zeros(
... (1, 1, input_ids.shape[1]), dtype=torch.float
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
... 0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
>>> loss = outputs.loss
>>> next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
>>> next_token_logits = (
... outputs.logits
... )  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1968,10 +1990,12 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
>>> model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
... 0
... )  # Batch size 1
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
......