Unverified Commit bda1cb02 authored by Gunjan Chhablani, committed by GitHub

Fix VisualBERT docs (#13106)

* Fix VisualBERT docs

* Show example notebooks as lists

* Fix style
parent e46ad22c
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
appropriately for the textual and visual parts.
The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

.. code-block::
@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
>>> # this is a custom function that returns the visual embeddings given the image path
>>> visual_embeds = get_visual_embeddings(image_path)
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+>>> inputs.update({
+... "visual_embeds": visual_embeds,
+... "visual_token_type_ids": visual_token_type_ids,
+... "visual_attention_mask": visual_attention_mask
+... })
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
...
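All of the snippets in this commit rely on a `get_visual_embeddings` helper that the documentation leaves unspecified (the notebooks linked above show a real Detectron-based pipeline). As a minimal, purely illustrative stand-in, assuming 36 regions and 2048-dimensional features (both numbers are assumptions and must match the checkpoint's `visual_embedding_dim`), it could be stubbed out like this; the modeling examples below call `.unsqueeze(0)` on the result to add a batch dimension.

.. code-block::

    import torch

    def get_visual_embeddings(image, num_regions=36, feature_dim=2048):
        """Hypothetical stand-in for a Detectron-like detector/feature extractor.

        Returns one feature vector per detected region, shape (num_regions, feature_dim).
        The notebooks linked above compute these from the image; here they are random
        so the doctest-style snippets can run end to end.
        """
        return torch.randn(num_regions, feature_dim)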
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):
>>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> outputs = model(**inputs)
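VisualBERT appends the visual tokens after the text tokens in a single sequence, so the hidden states returned above cover both parts. A quick sanity check, continuing the snippet with the stubbed `get_visual_embeddings` from earlier (assuming the checkpoint accepts those feature sizes); this combined length is also why the pre-training example below pads its labels to text length plus the number of visual embeddings.

.. code-block::

    >>> # text tokens come first, then one position per visual embedding
    >>> expected_len = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
    >>> outputs.last_hidden_state.shape[1] == expected_len
    True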
@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
>>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
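The masked-language-modelling labels in this example are padded to `max_length` because the prediction head scores every position of the combined text-plus-visual sequence. With purely illustrative numbers (not taken from the diff):

.. code-block::

    >>> # e.g. 10 text tokens and 36 region features (assumed sizes)
    >>> text_len, num_visual_tokens = 10, 36
    >>> max_length = text_len + num_visual_tokens  # labels must cover all 46 positions
    >>> max_length
    46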
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
>>> # batch size is 1
->>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
+>>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
->>> inputs_dict.update({{
-... visual_embeds=visual_embeds,
-... visual_attention_mask=visual_attention_mask,
-... visual_token_type_ids=visual_token_type_ids,
-... labels=labels
-... }})
+>>> inputs_dict.update({
+... "visual_embeds": visual_embeds,
+... "visual_attention_mask": visual_attention_mask,
+... "visual_token_type_ids": visual_token_type_ids,
+... "labels": labels
+... })
>>> outputs = model(**inputs_dict)
>>> loss = outputs.loss
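The hunk above only fixes the keyword-argument syntax into proper dict keys; it does not show how `visual_embeds`, `visual_token_type_ids`, `visual_attention_mask` and `labels` were built. For the multiple-choice head every tensor carries a leading `(batch_size, num_choices, ...)` shape, so one plausible preparation (an assumption, mirroring the `unsqueeze(0)` applied to the text inputs, and reusing the stubbed `get_visual_embeddings`) is:

.. code-block::

    >>> # assumed preparation, not part of the original example
    >>> visual_embeds = get_visual_embeddings(image)                      # (num_regions, dim)
    >>> visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)  # repeat for the 2 choices
    >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    >>> labels = torch.tensor(0).unsqueeze(0)                             # index of the correct choice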
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2
@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
>>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
->>> inputs.update({{
+>>> inputs.update({
... "region_to_phrase_position": region_to_phrase_position,
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
...
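In the region-to-phrase alignment example just above, both auxiliary tensors are sized from the combined text-plus-visual sequence. A shape walk-through with assumed sizes (8 text tokens, 36 regions); reading the last axis of `labels` as one score per visual region is my interpretation, not something stated in the diff:

.. code-block::

    >>> # assumed sizes: 8 text tokens, 36 detected regions
    >>> text_len, num_regions = 8, 36
    >>> # region_to_phrase_position: one position index per token of the combined sequence
    >>> (1, text_len + num_regions)               # shape of region_to_phrase_position
    (1, 44)
    >>> # labels: for every combined-sequence position, a score over the visual regions
    >>> (1, text_len + num_regions, num_regions)  # shape of labels
    (1, 44, 36)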