Unverified Commit bda1cb02 authored by Gunjan Chhablani, committed by GitHub

Fix VisualBERT docs (#13106)

* Fix VisualBERT docs

* Show example notebooks as lists

* Fix style
parent e46ad22c
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
appropriately for the textual and visual parts.
The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

.. code-block::
@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
>>> # this is a custom function that returns the visual embeddings given the image path
>>> visual_embeds = get_visual_embeddings(image_path)
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+>>> inputs.update({
+... "visual_embeds": visual_embeds,
+... "visual_token_type_ids": visual_token_type_ids,
+... "visual_attention_mask": visual_attention_mask
+... })
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
...
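All of the snippets in this commit rely on a `get_visual_embeddings` helper that the documentation leaves unspecified (the notebooks linked above show a real Detectron-based pipeline). As a minimal, purely illustrative stand-in, assuming 36 regions and 2048-dimensional features (both numbers are assumptions and must match the checkpoint's `visual_embedding_dim`), it could be stubbed out like this; the modeling examples below call `.unsqueeze(0)` on the result to add a batch dimension.

.. code-block::

    import torch

    def get_visual_embeddings(image, num_regions=36, feature_dim=2048):
        """Hypothetical stand-in for a Detectron-like detector/feature extractor.

        Returns one feature vector per detected region, shape (num_regions, feature_dim).
        The notebooks linked above compute these from the image; here they are random
        so the doctest-style snippets can run end to end.
        """
        return torch.randn(num_regions, feature_dim)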
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):
>>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> outputs = model(**inputs)
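VisualBERT appends the visual tokens after the text tokens in a single sequence, so the hidden states returned above cover both parts. A quick sanity check, continuing the snippet with the stubbed `get_visual_embeddings` from earlier (assuming the checkpoint accepts those feature sizes); this combined length is also why the pre-training example below pads its labels to text length plus the number of visual embeddings.

.. code-block::

    >>> # text tokens come first, then one position per visual embedding
    >>> expected_len = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
    >>> outputs.last_hidden_state.shape[1] == expected_len
    True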
@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
>>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
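The masked-language-modelling labels in this example are padded to `max_length` because the prediction head scores every position of the combined text-plus-visual sequence. With purely illustrative numbers (not taken from the diff):

.. code-block::

    >>> # e.g. 10 text tokens and 36 region features (assumed sizes)
    >>> text_len, num_visual_tokens = 10, 36
    >>> max_length = text_len + num_visual_tokens  # labels must cover all 46 positions
    >>> max_length
    46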
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
>>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
>>> # batch size is 1
->>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
+>>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
->>> inputs_dict.update({{
-... visual_embeds=visual_embeds,
-... visual_attention_mask=visual_attention_mask,
-... visual_token_type_ids=visual_token_type_ids,
-... labels=labels
-... }})
+>>> inputs_dict.update({
+... "visual_embeds": visual_embeds,
+... "visual_attention_mask": visual_attention_mask,
+... "visual_token_type_ids": visual_token_type_ids,
+... "labels": labels
+... })
>>> outputs = model(**inputs_dict)
>>> loss = outputs.loss
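The hunk above only fixes the keyword-argument syntax into proper dict keys; it does not show how `visual_embeds`, `visual_token_type_ids`, `visual_attention_mask` and `labels` were built. For the multiple-choice head every tensor carries a leading `(batch_size, num_choices, ...)` shape, so one plausible preparation (an assumption, mirroring the `unsqueeze(0)` applied to the text inputs, and reusing the stubbed `get_visual_embeddings`) is:

.. code-block::

    >>> # assumed preparation, not part of the original example
    >>> visual_embeds = get_visual_embeddings(image)                      # (num_regions, dim)
    >>> visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape)  # repeat for the 2 choices
    >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    >>> labels = torch.tensor(0).unsqueeze(0)                             # index of the correct choice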
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2
@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({{
+>>> inputs.update({
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2
@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
>>> text = "Who is eating the apple?"
>>> inputs = tokenizer(text, return_tensors='pt')
>>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
>>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
->>> inputs.update({{
+>>> inputs.update({
... "region_to_phrase_position": region_to_phrase_position,
... "visual_embeds": visual_embeds,
... "visual_token_type_ids": visual_token_type_ids,
... "visual_attention_mask": visual_attention_mask
-... }})
+... })
>>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
...
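In the region-to-phrase alignment example just above, both auxiliary tensors are sized from the combined text-plus-visual sequence. A shape walk-through with assumed sizes (8 text tokens, 36 regions); reading the last axis of `labels` as one score per visual region is my interpretation, not something stated in the diff:

.. code-block::

    >>> # assumed sizes: 8 text tokens, 36 detected regions
    >>> text_len, num_regions = 8, 36
    >>> # region_to_phrase_position: one position index per token of the combined sequence
    >>> (1, text_len + num_regions)               # shape of region_to_phrase_position
    (1, 44)
    >>> # labels: for every combined-sequence position, a score over the visual regions
    >>> (1, text_len + num_regions, num_regions)  # shape of labels
    (1, 44, 36)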