"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "6bc6797e04811176f4244a42c86f8a65a1e1c455"
Unverified Commit 9e290804 authored by NielsRogge, committed by GitHub

[X-CLIP] Fix doc tests (#19523)

* Fix XCLIP doc tests

* Add model to doc test list

* Fix tests
parent eefcecaa
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"),
         ("wavlm", "Wav2Vec2Processor"),
         ("whisper", "WhisperProcessor"),
-        ("xclip", "CLIPProcessor"),
+        ("xclip", "XCLIPProcessor"),
     ]
 )
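For quick verification (a sketch, not part of this diff): with the corrected mapping, `AutoProcessor` should now resolve X-CLIP checkpoints to the dedicated `XCLIPProcessor` rather than the plain `CLIPProcessor`. The checkpoint name below is the one used throughout this PR; the asserted behaviour is the assumed effect of the mapping fix.

```python
# Sketch (not part of the diff): after the mapping fix, AutoProcessor is expected
# to return XCLIPProcessor for X-CLIP checkpoints instead of CLIPProcessor.
from transformers import AutoProcessor, XCLIPProcessor

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
assert isinstance(processor, XCLIPProcessor)  # assumed behaviour after this change
```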
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPVisionModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, XCLIPVisionModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
         >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
-        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        >>> pixel_values = pixel_values.reshape(-1, num_channels, height, width)
-        >>> outputs = model(**inputs)
+        >>> outputs = model(pixel_values)
         >>> last_hidden_state = outputs.last_hidden_state
         >>> pooled_output = outputs.pooler_output  # pooled CLS states
         ```"""
         return self.vision_model(
             pixel_values=pixel_values,
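A note on the reshape in the updated example (annotation, not part of the diff): the vision model consumes individual frames, so the `(batch, frames, channels, height, width)` tensor produced by the processor is flattened before the forward pass. A shape-only sketch; the 8 frames and the 224x224 resolution are assumptions for the base-patch32 checkpoint.

```python
# Shape-only sketch of the flattening step used in the new docstring example.
import torch

pixel_values = torch.randn(1, 8, 3, 224, 224)  # 1 clip of 8 sampled frames (assumed size)
batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(-1, num_channels, height, width)
print(pixel_values.shape)  # torch.Size([8, 3, 224, 224])
```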
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from transformers import CLIPTokenizer, XCLIPModel
+        >>> from transformers import AutoTokenizer, AutoModel
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
         >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
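Switching to the Auto classes does not change which tokenizer is loaded here: X-CLIP reuses CLIP's text tokenizer, so `AutoTokenizer` should resolve this checkpoint to a CLIP tokenizer class. A quick check (a sketch, not part of the diff; the exact resolved class name is an assumption):

```python
# Sketch: AutoTokenizer is expected to resolve the X-CLIP checkpoint to CLIP's tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
print(type(tokenizer).__name__)  # expected: a CLIP tokenizer class (assumption)
```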
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
+        >>> inputs = processor(videos=list(video), return_tensors="pt")
         >>> video_features = model.get_video_features(**inputs)
         ```"""
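If `decord` is not installed, the same call path can be smoke-tested with dummy frames (a sketch, not part of the diff). It assumes the processor accepts a list of `H x W x C` uint8 NumPy frame arrays and that the base checkpoint expects 8 frames; the frame resolution below is arbitrary.

```python
# Sketch with dummy frames, assuming the processor accepts a list of uint8 numpy arrays.
import numpy as np
from transformers import AutoModel, AutoProcessor

video = [np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8) for _ in range(8)]

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")

inputs = processor(videos=video, return_tensors="pt")
video_features = model.get_video_features(**inputs)
print(video_features.shape)
```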
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
         >>> inputs = processor(
-        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ...     text=["playing sports", "eating spaghetti", "go shopping"],
+        ...     videos=list(video),
+        ...     return_tensors="pt",
+        ...     padding=True,
         ... )
-        >>> outputs = model(**inputs)
+        >>> # forward pass
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
         >>> logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
         >>> probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        >>> print(probs)
+        tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
         ```"""
         # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
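To connect the printed probabilities back to labels (a sketch, not part of the diff), the argmax over the three candidate texts picks the predicted description; the tensor below is copied from the expected output in the docstring above.

```python
# Sketch: mapping the docstring's probabilities back to the candidate texts.
import torch

label_names = ["playing sports", "eating spaghetti", "go shopping"]
probs = torch.tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])  # expected output above
print(label_names[probs.argmax(dim=-1).item()])  # -> "eating spaghetti"
```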
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
             torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
         )
-        expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device)
+        expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)
         self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
 src/transformers/models/whisper/modeling_whisper.py
 src/transformers/models/whisper/modeling_tf_whisper.py
 src/transformers/models/yolos/modeling_yolos.py
+src/transformers/models/x_clip/modeling_x_clip.py