Unverified Commit b5e2b183 authored by Sylvain Gugger, committed by GitHub

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
parent e13f72fb
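
Most of the changes below are mechanical: the doc styler strips the `>>> `/`... ` prompts from example blocks, runs black over the resulting code (with the repository's long line length), and puts the prompts back. As a rough illustration of that round trip, here is a minimal sketch; it is not the actual style_doc implementation, and the helper name, the bracket-counting heuristic for restoring prompts, and the demo call are assumptions for illustration only.

```python
# Minimal sketch only (assumed names; not the real style_doc code): black-format a doctest
# block and restore its prompts afterwards.
import black


def restyle_doctest(example: str, line_length: int = 119) -> str:
    # Drop the interactive prompts so black sees plain Python source.
    code = "\n".join(
        line[4:] if line.startswith((">>> ", "... ")) else line for line in example.splitlines()
    )
    formatted = black.format_str(code, mode=black.Mode(line_length=line_length))

    # Put prompts back: ">>> " when no brackets are open and the line is not indented,
    # "... " for continuation lines. A rough heuristic, good enough for a sketch.
    restyled, depth = [], 0
    for line in formatted.splitlines():
        if not line.strip():
            restyled.append(line)
        elif depth == 0 and not line.startswith((" ", ")", "]", "}")):
            restyled.append(">>> " + line)
        else:
            restyled.append("... " + line)
        depth += sum(line.count(c) for c in "([{") - sum(line.count(c) for c in ")]}")
    return "\n".join(restyled)


# Single quotes come back as double quotes; over-long lines are wrapped and re-prompted with "... ".
print(restyle_doctest(">>> model = MyModel.from_pretrained('my-model')"))
```

The real utility also has to deal with triple quotes inside docstrings and with code blocks in MDX files, which is what the remaining bullets above refer to.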
@@ -58,17 +58,17 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
 >>> # Accessing the model configuration
 >>> config_encoder = model.config.encoder
 >>> config_decoder = model.config.decoder
 >>> # set decoder config to causal lm
 >>> config_decoder.is_decoder = True
 >>> config_decoder.add_cross_attention = True
 >>> # Saving the model, including its configuration
->>> model.save_pretrained('my-model')
+>>> model.save_pretrained("my-model")
 >>> # loading model and config from pretrained folder
->>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
->>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+>>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model")
+>>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
 ```"""
 model_type = "vision-encoder-decoder"
 is_composition = True
...
@@ -397,13 +397,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
->>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
 >>> encoder_outputs = model.encode(pixel_values)
@@ -474,13 +474,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
->>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
 >>> encoder_outputs = model.encode(pixel_values)
@@ -601,16 +601,16 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> # load output tokenizer
->>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+>>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
 >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
->>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
@@ -746,8 +746,11 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 ```python
 >>> from transformers import FlaxVisionEncoderDecoderModel
 >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
->>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'gpt2')
+>>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+...     "google/vit-base-patch16-224-in21k", "gpt2"
+... )
 >>> # saving model after fine-tuning
 >>> model.save_pretrained("./vit-gpt2")
 >>> # load fine-tuned model
...
@@ -302,8 +302,11 @@ class VisionEncoderDecoderModel(PreTrainedModel):
 ```python
 >>> from transformers import VisionEncoderDecoderModel
 >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
->>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'bert-base-uncased')
+>>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
+... )
 >>> # saving model after fine-tuning
 >>> model.save_pretrained("./vit-bert")
 >>> # load fine-tuned model
@@ -417,8 +420,8 @@ class VisionEncoderDecoderModel(PreTrainedModel):
 >>> from PIL import Image
 >>> import torch
->>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
->>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
+>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 >>> # load image from the IAM dataset
 >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
...
@@ -61,15 +61,15 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
 >>> model = VisionTextDualEncoderModel(config=config)
 >>> # Accessing the model configuration
 >>> config_vision = model.config.vision_config
 >>> config_text = model.config.text_config
 >>> # Saving the model, including its configuration
->>> model.save_pretrained('my-model')
+>>> model.save_pretrained("my-model")
 >>> # loading model and config from pretrained folder
->>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
->>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
+>>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained("vit-bert")
+>>> model = VisionTextDualEncoderModel.from_pretrained("vit-bert", config=vision_text_config)
 ```"""
 model_type = "vision-text-dual-encoder"
...
@@ -446,12 +446,15 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
 ```python
 >>> from transformers import FlaxVisionTextDualEncoderModel
 >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
->>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
+>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
+...     "bert-base-uncased", "google/vit-base-patch16-224"
+... )
 >>> # saving model after fine-tuning
 >>> model.save_pretrained("./vit-bert")
 >>> # load fine-tuned model
 >>> model = FlaxVisionTextDualEncoderModel.from_pretrained("./vit-bert")
 ```"""
 kwargs_vision = {
@@ -531,19 +534,36 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
 >>> from PIL import Image
 >>> import requests
 >>> import jax
->>> from transformers import FlaxVisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+>>> from transformers import (
+...     FlaxVisionTextDualEncoderModel,
+...     VisionTextDualEncoderProcessor,
+...     ViTFeatureExtractor,
+...     BertTokenizer,
+... )
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
 >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
->>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+>>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
+...     "google/vit-base-patch16-224", "bert-base-uncased"
+... )
 >>> # contrastive training
->>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
+>>> urls = [
+...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+...     "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
+... ]
 >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
->>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True)
->>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
->>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+>>> inputs = processor(
+...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True
+... )
+>>> outputs = model(
+...     input_ids=inputs.input_ids,
+...     attention_mask=inputs.attention_mask,
+...     pixel_values=inputs.pixel_values,
+...     return_loss=True,
+... )
+>>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag  # this is the image-text similarity score
 >>> # save and load from pretrained
 >>> model.save_pretrained("vit-bert")
@@ -551,8 +571,8 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
 >>> # inference
 >>> outputs = model(**inputs)
->>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
->>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
 ```
 """
...
@@ -231,7 +231,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 >>> model = VisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
 >>> tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")
 >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="pt")
 >>> text_features = model.get_text_features(**inputs)
 ```"""
 text_outputs = self.text_model(
@@ -312,19 +312,36 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 ```python
 >>> from PIL import Image
 >>> import requests
->>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+>>> from transformers import (
+...     VisionTextDualEncoderModel,
+...     VisionTextDualEncoderProcessor,
+...     ViTFeatureExtractor,
+...     BertTokenizer,
+... )
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
 >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
->>> model = VisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
+...     "google/vit-base-patch16-224", "bert-base-uncased"
+... )
 >>> # contrastive training
->>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
+>>> urls = [
+...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+...     "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
+... ]
 >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
->>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)
->>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
->>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+>>> inputs = processor(
+...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True
+... )
+>>> outputs = model(
+...     input_ids=inputs.input_ids,
+...     attention_mask=inputs.attention_mask,
+...     pixel_values=inputs.pixel_values,
+...     return_loss=True,
+... )
+>>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag  # this is the image-text similarity score
 >>> # save and load from pretrained
 >>> model.save_pretrained("vit-bert")
@@ -332,8 +349,8 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 >>> # inference
 >>> outputs = model(**inputs)
->>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
->>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 ```"""
 return_dict = return_dict if return_dict is not None else self.config.return_dict
@@ -447,8 +464,8 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 ```python
 >>> from transformers import VisionTextDualEncoderModel
 >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
->>> model = VisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
+>>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
+...     "bert-base-uncased", "google/vit-base-patch16-224"
+... )
 >>> # saving model after fine-tuning
 >>> model.save_pretrained("./vit-bert")
 >>> # load fine-tuned model
...
@@ -93,7 +93,7 @@ class VisualBertConfig(PretrainedConfig):
 >>> from transformers import VisualBertModel, VisualBertConfig
 >>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
->>> configuration = VisualBertConfig.from_pretrained('visualbert-vqa-coco-pre')
+>>> configuration = VisualBertConfig.from_pretrained("visualbert-vqa-coco-pre")
 >>> # Initializing a model from the visualbert-vqa-coco-pre style configuration
 >>> model = VisualBertModel(configuration)
...
@@ -745,19 +745,21 @@ class VisualBertModel(VisualBertPreTrainedModel):
 from transformers import BertTokenizer, VisualBertModel
 import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
 visual_embeds = get_visual_embeddings(image).unsqueeze(0)
 visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-inputs.update({
-    "visual_embeds": visual_embeds,
-    "visual_token_type_ids": visual_token_type_ids,
-    "visual_attention_mask": visual_attention_mask
-})
+inputs.update(
+    {
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": visual_token_type_ids,
+        "visual_attention_mask": visual_attention_mask,
+    }
+)
 outputs = model(**inputs)
@@ -927,22 +929,26 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
 # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
 from transformers import BertTokenizer, VisualBertForPreTraining
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
 visual_embeds = get_visual_embeddings(image).unsqueeze(0)
 visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-inputs.update({
-    "visual_embeds": visual_embeds,
-    "visual_token_type_ids": visual_token_type_ids,
-    "visual_attention_mask": visual_attention_mask
-})
-max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
-labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
-sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
+inputs.update(
+    {
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": visual_token_type_ids,
+        "visual_attention_mask": visual_attention_mask,
+    }
+)
+max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
+labels = tokenizer(
+    "The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length
+)["input_ids"]
+sentence_image_labels = torch.tensor(1).unsqueeze(0)  # Batch_size
 outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
@@ -1063,8 +1069,8 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
 from transformers import BertTokenizer, VisualBertForMultipleChoice
 import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertForMultipleChoice.from_pretrained('uclanlp/visualbert-vcr')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")
 prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
 choice0 = "It is eaten with a fork and a knife."
@@ -1078,15 +1084,17 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
 labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
-encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
+encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
 # batch size is 1
-inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
-inputs_dict.update({
-    "visual_embeds": visual_embeds,
-    "visual_attention_mask": visual_attention_mask,
-    "visual_token_type_ids": visual_token_type_ids,
-    "labels": labels
-})
+inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
+inputs_dict.update(
+    {
+        "visual_embeds": visual_embeds,
+        "visual_attention_mask": visual_attention_mask,
+        "visual_token_type_ids": visual_token_type_ids,
+        "labels": labels,
+    }
+)
 outputs = model(**inputs_dict)
 loss = outputs.loss
@@ -1212,22 +1220,24 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
 from transformers import BertTokenizer, VisualBertForQuestionAnswering
 import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
 text = "Who is eating the apple?"
-inputs = tokenizer(text, return_tensors='pt')
+inputs = tokenizer(text, return_tensors="pt")
 visual_embeds = get_visual_embeddings(image).unsqueeze(0)
 visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-inputs.update({
-    "visual_embeds": visual_embeds,
-    "visual_token_type_ids": visual_token_type_ids,
-    "visual_attention_mask": visual_attention_mask
-})
+inputs.update(
+    {
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": visual_token_type_ids,
+        "visual_attention_mask": visual_attention_mask,
+    }
+)
-labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
+labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num labels 2
 outputs = model(**inputs, labels=labels)
 loss = outputs.loss
@@ -1336,20 +1346,22 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
 from transformers import BertTokenizer, VisualBertForVisualReasoning
 import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2")
 text = "Who is eating the apple?"
-inputs = tokenizer(text, return_tensors='pt')
+inputs = tokenizer(text, return_tensors="pt")
 visual_embeds = get_visual_embeddings(image).unsqueeze(0)
 visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-inputs.update({
-    "visual_embeds": visual_embeds,
-    "visual_token_type_ids": visual_token_type_ids,
-    "visual_attention_mask": visual_attention_mask
-})
+inputs.update(
+    {
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": visual_token_type_ids,
+        "visual_attention_mask": visual_attention_mask,
+    }
+)
 labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2
@@ -1498,24 +1510,28 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
 from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment
 import torch
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = VisualBertForRegionToPhraseAlignment.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 text = "Who is eating the apple?"
-inputs = tokenizer(text, return_tensors='pt')
+inputs = tokenizer(text, return_tensors="pt")
 visual_embeds = get_visual_embeddings(image).unsqueeze(0)
 visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
-inputs.update({
-    "region_to_phrase_position": region_to_phrase_position,
-    "visual_embeds": visual_embeds,
-    "visual_token_type_ids": visual_token_type_ids,
-    "visual_attention_mask": visual_attention_mask
-})
-labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
+region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]))
+inputs.update(
+    {
+        "region_to_phrase_position": region_to_phrase_position,
+        "visual_embeds": visual_embeds,
+        "visual_token_type_ids": visual_token_type_ids,
+        "visual_attention_mask": visual_attention_mask,
+    }
+)
+labels = torch.ones(
+    (1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2])
+)  # Batch size 1
 outputs = model(**inputs, labels=labels)
 loss = outputs.loss
...
@@ -521,11 +521,11 @@ FLAX_VISION_MODEL_DOCSTRING = """
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
->>> model = FlaxViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> inputs = feature_extractor(images=image, return_tensors="np")
 >>> outputs = model(**inputs)
@@ -603,11 +603,11 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
 >>> import jax
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
->>> model = FlaxViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+>>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 >>> inputs = feature_extractor(images=image, return_tensors="np")
 >>> outputs = model(**inputs)
...
@@ -682,11 +682,11 @@ class TFViTModel(TFViTPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
->>> model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> inputs = feature_extractor(images=image, return_tensors="tf")
 >>> outputs = model(**inputs)
@@ -803,11 +803,11 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
->>> model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+>>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 >>> inputs = feature_extractor(images=image, return_tensors="tf")
 >>> outputs = model(**inputs)
...
@@ -522,11 +522,11 @@ class ViTModel(ViTPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
->>> model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+>>> model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 >>> inputs = feature_extractor(images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
@@ -634,11 +634,11 @@ class ViTForImageClassification(ViTPreTrainedModel):
 >>> from PIL import Image
 >>> import requests
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 >>> image = Image.open(requests.get(url, stream=True).raw)
->>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
->>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+>>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+>>> model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 >>> inputs = feature_extractor(images=image, return_tensors="pt")
 >>> outputs = model(**inputs)
...
@@ -952,15 +952,19 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
 >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
 >>> def map_to_array(batch):
->>>     speech, _ = sf.read(batch["file"])
->>>     batch["speech"] = speech
->>>     return batch
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
->>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values # Batch size 1
+>>> input_values = processor(
+...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
+>>> ).input_values  # Batch size 1
 >>> hidden_states = model(input_values).last_hidden_state
 ```
 """
@@ -1055,15 +1059,19 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
 >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 >>> def map_to_array(batch):
->>>     speech, _ = sf.read(batch["file"])
->>>     batch["speech"] = speech
->>>     return batch
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
->>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values # Batch size 1
+>>> input_values = processor(
+...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
+>>> ).input_values  # Batch size 1
 >>> logits = model(input_values).logits
 >>> predicted_ids = jnp.argmax(logits, axis=-1)
@@ -1264,9 +1272,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
 >>> outputs = model(input_values, mask_time_indices=mask_time_indices)
 >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
->>> cosine_sim = optax.cosine_similarity(
-...     outputs.projected_states, outputs.projected_quantized_states
-... )
+>>> cosine_sim = optax.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states)
 >>> # show that cosine similarity is much higher than random
 >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
...
@@ -1408,10 +1408,12 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
 >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
 >>> def map_to_array(batch):
->>>     speech, _ = sf.read(batch["file"])
->>>     batch["speech"] = speech
->>>     return batch
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
@@ -1519,15 +1521,17 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 >>> def map_to_array(batch):
->>>     speech, _ = sf.read(batch["file"])
->>>     batch["speech"] = speech
->>>     return batch
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
 >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
 >>> logits = model(input_values).logits
 >>> predicted_ids = tf.argmax(logits, axis=-1)
@@ -1538,7 +1542,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 >>> # wrap processor as target processor to encode labels
 >>> with processor.as_target_processor():
->>>     labels = processor(transcription, return_tensors="tf").input_ids
+...     labels = processor(transcription, return_tensors="tf").input_ids
 >>> loss = model(input_values, labels=labels).loss
 ```"""
...
@@ -1421,9 +1421,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
 ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
 >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
->>> cosine_sim = torch.cosine_similarity(
-...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
-... )
+>>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
 >>> # show that cosine similarity is much higher than random
 >>> assert cosine_sim[mask_time_indices].mean() > 0.5
@@ -1568,10 +1566,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
 >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 >>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
 >>> def map_to_array(batch):
->>>     speech, _ = sf.read(batch["file"])
->>>     batch["speech"] = speech
->>>     return batch
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
...
@@ -298,11 +298,11 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
 ```python
 # Let's see how to increase the vocabulary of Bert model and tokenizer
-tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
-model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
-num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-print('We have added', num_added_toks, 'tokens')
+tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+print("We have added", num_added_toks, "tokens")
 # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
 model.resize_token_embeddings(len(tokenizer))
 ```"""
...
@@ -370,11 +370,11 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
 ```python
 # Let's see how to increase the vocabulary of Bert model and tokenizer
-tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
-model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
-num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-print('We have added', num_added_toks, 'tokens')
+tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+print("We have added", num_added_toks, "tokens")
 # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
 model.resize_token_embeddings(len(tokenizer))
 ```"""
...
@@ -1042,10 +1042,12 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
 >>> import torch
->>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
->>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+>>> tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
+>>> model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-en-2048")
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
+>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+...     0
+>>> )  # Batch size 1
 >>> start_positions = torch.tensor([1])
 >>> end_positions = torch.tensor([3])
...
@@ -46,8 +46,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
 >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetEncoder
 >>> import torch
->>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
->>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone')
+>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+>>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
 >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
 >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
 >>> outputs = model(**inputs)
@@ -69,8 +69,10 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
 >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetDecoder
 >>> import torch
->>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
->>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False)
+>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+>>> model = XLMProphetNetDecoder.from_pretrained(
+...     "patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False
+... )
 >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
 >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
 >>> outputs = model(**inputs)
@@ -91,10 +93,12 @@ class XLMProphetNetModel(ProphetNetModel):
 ```python
 >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetModel
->>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
->>> model = XLMProphetNetModel.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+>>> model = XLMProphetNetModel.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
->>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+>>> input_ids = tokenizer(
+...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+>>> ).input_ids  # Batch size 1
 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
 >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
@@ -115,10 +119,12 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
 ```python
 >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForConditionalGeneration
->>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
->>> model = XLMProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+>>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+>>> model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
->>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1
+>>> input_ids = tokenizer(
+...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+>>> ).input_ids  # Batch size 1
 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
 >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
...@@ -140,8 +146,8 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM): ...@@ -140,8 +146,8 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
>>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
>>> import torch >>> import torch
>>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = XLMProphetNetForCausalLM.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') >>> model = XLMProphetNetForCausalLM.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
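Since the causal-LM example only runs a plain forward pass, a short sketch of the usual training-style call follows; reusing the input ids as `labels` is an assumption made here purely for illustration.

```python
from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM

tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
model = XLMProphetNetForCausalLM.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Passing labels makes the head return the next-token prediction loss alongside the logits
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss, outputs.logits.shape)
```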
...@@ -152,14 +158,16 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM): ...@@ -152,14 +158,16 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
>>> from transformers import EncoderDecoderModel, XLMProphetNetTokenizer, XLMRobertaTokenizer >>> from transformers import EncoderDecoderModel, XLMProphetNetTokenizer, XLMRobertaTokenizer
>>> import torch >>> import torch
>>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') >>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
>>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') >>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
...     "xlm-roberta-large", "microsoft/xprophetnet-large-wiki100-cased"
... )
>>> ARTICLE = ( >>> ARTICLE = (
... "the us state department said wednesday it had received no " ... "the us state department said wednesday it had received no "
... "formal word from bolivia that it was expelling the us ambassador there " ... "formal word from bolivia that it was expelling the us ambassador there "
... "but said the charges made against him are `` baseless ." ... "but said the charges made against him are `` baseless ."
... ) ... )
>>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
>>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
......
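The encoder-decoder example is truncated after building `input_ids` and `labels`. A minimal sketch of the natural next step, computing a training loss, is given below; feeding the target ids as both decoder inputs and labels is an assumption for illustration, not a line from this commit.

```python
from transformers import EncoderDecoderModel, XLMProphetNetTokenizer, XLMRobertaTokenizer

tokenizer_enc = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
tokenizer_dec = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "xlm-roberta-large", "microsoft/xprophetnet-large-wiki100-cased"
)

ARTICLE = (
    "the us state department said wednesday it had received no "
    "formal word from bolivia that it was expelling the us ambassador there "
    "but said the charges made against him are `` baseless ."
)
input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids

# Using the summary ids as decoder inputs and labels yields the cross-entropy training loss
loss = model(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
print(float(loss))
```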
...@@ -1321,21 +1321,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): ...@@ -1321,21 +1321,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
>>> import numpy as np >>> import numpy as np
>>> from transformers import XLNetTokenizer, TFXLNetLMHeadModel >>> from transformers import XLNetTokenizer, TFXLNetLMHeadModel
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
>>> model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') >>> model = TFXLNetLMHeadModel.from_pretrained("xlnet-large-cased")
>>> # We show how to setup inputs to predict a next token using a bi-directional context. >>> # We show how to setup inputs to predict a next token using a bi-directional context.
>>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[
...     None, :
... ]  # We will predict the masked token
>>> perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
>>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
>>> target_mapping = np.zeros(
...     (1, 1, input_ids.shape[1])
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
...     0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(
...     input_ids,
...     perm_mask=tf.constant(perm_mask, dtype=tf.float32),
...     target_mapping=tf.constant(target_mapping, dtype=tf.float32),
... )
>>> next_token_logits = outputs[
...     0
... ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
```""" ```"""
inputs = input_processing( inputs = input_processing(
func=self.call, func=self.call,
......
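Because the TF example ends at `next_token_logits`, here is a self-contained sketch that also decodes the prediction; the final `tf.argmax` / `tokenizer.decode` step is an illustrative addition and not part of the diff.

```python
import numpy as np
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetLMHeadModel

tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model = TFXLNetLMHeadModel.from_pretrained("xlnet-large-cased")

input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
perm_mask[:, :, -1] = 1.0  # previous tokens do not attend to the last (masked) token
target_mapping = np.zeros((1, 1, input_ids.shape[1]))
target_mapping[0, 0, -1] = 1.0  # predict only the last position

outputs = model(
    input_ids,
    perm_mask=tf.constant(perm_mask, dtype=tf.float32),
    target_mapping=tf.constant(target_mapping, dtype=tf.float32),
)

# Pick the highest-scoring vocabulary id for the single predicted position and turn it back into text
predicted_id = int(tf.argmax(outputs.logits[0, 0], axis=-1))
print(tokenizer.decode([predicted_id]))
```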
...@@ -1400,31 +1400,53 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1400,31 +1400,53 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
>>> from transformers import XLNetTokenizer, XLNetLMHeadModel >>> from transformers import XLNetTokenizer, XLNetLMHeadModel
>>> import torch >>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
>>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') >>> model = XLNetLMHeadModel.from_pretrained("xlnet-large-cased")
>>> # We show how to setup inputs to predict a next token using a bi-directional context. >>> # We show how to setup inputs to predict a next token using a bi-directional context.
>>> input_ids = torch.tensor(
...     tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
... ).unsqueeze(
...     0
... )  # We will predict the masked token
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
>>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
>>> target_mapping = torch.zeros(
...     (1, 1, input_ids.shape[1]), dtype=torch.float
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
...     0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
>>> next_token_logits = outputs[
...     0
... ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
>>> # In the same way, the XLNetLMHeadModel can be trained with standard auto-regressive language modeling.
>>> input_ids = torch.tensor(
...     tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
... ).unsqueeze(
...     0
... )  # We will predict the masked token
>>> labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
>>> assert labels.shape[0] == 1, "only one word will be predicted"
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
>>> perm_mask[
...     :, :, -1
... ] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
>>> target_mapping = torch.zeros(
...     (1, 1, input_ids.shape[1]), dtype=torch.float
... )  # Shape [1, 1, seq_length] => let's predict one token
>>> target_mapping[
...     0, 0, -1
... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
>>> loss = outputs.loss
>>> next_token_logits = (
...     outputs.logits
... )  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
```""" ```"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
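The PyTorch version of the example can be completed in the same way. This sketch is self-contained; the final argmax/decode step is an illustrative addition rather than part of the commit.

```python
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel

tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-large-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # previous tokens do not attend to the last (masked) token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
target_mapping[0, 0, -1] = 1.0  # predict only the last position

with torch.no_grad():
    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)

# logits: [batch, num_predict, vocab_size]; decode the single predicted position
predicted_id = outputs.logits[0, 0].argmax(-1).item()
print(tokenizer.decode([predicted_id]))
```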
...@@ -1968,10 +1990,12 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1968,10 +1990,12 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
>>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
>>> import torch >>> import torch
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
>>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') >>> model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
... 0
>>> ) # Batch size 1
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3]) >>> end_positions = torch.tensor([3])
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
......
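For the question-answering head, a short hedged sketch of what can be read from the outputs: with start/end positions the model returns a loss, and without them it returns top-k span candidates. The attribute names (`loss`, `start_top_index`, `end_top_index`) are assumed from the XLNet QA output class and may need checking against the current API.

```python
import torch
from transformers import XLNetTokenizer, XLNetForQuestionAnswering

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

# With gold start/end positions the head returns a span-prediction loss
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
print(outputs.loss)

# Without positions it returns beam-search style top-k start/end candidates instead
eval_outputs = model(input_ids)
print(eval_outputs.start_top_index, eval_outputs.end_top_index)
```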