Unverified Commit 9e290804 authored by NielsRogge, committed by GitHub

[X-CLIP] Fix doc tests (#19523)

* Fix XCLIP doc tests

* Add model to doc test list

* Fix tests
parent eefcecaa
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"),
         ("wavlm", "Wav2Vec2Processor"),
         ("whisper", "WhisperProcessor"),
-        ("xclip", "CLIPProcessor"),
+        ("xclip", "XCLIPProcessor"),
     ]
 )
...
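Not part of the diff, but as a quick illustration of what the mapping fix above changes: with the "xclip" entry now pointing at `XCLIPProcessor`, loading the processor through the auto class resolves to the video-aware processor used in the updated doc examples below. A minimal sketch, assuming a recent `transformers` install:

```python
# Sanity check for the mapping fix above (illustrative, not part of this commit).
from transformers import AutoProcessor, XCLIPProcessor

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")

# Before the fix this resolved to CLIPProcessor; now it returns an XCLIPProcessor,
# which accepts the `videos=` argument used in the doc examples below.
print(type(processor).__name__)
assert isinstance(processor, XCLIPProcessor)
```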
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPVisionModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, XCLIPVisionModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 8 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
         >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
-        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        >>> pixel_values = pixel_values.reshape(-1, num_channels, height, width)
-        >>> outputs = model(**inputs)
+        >>> outputs = model(pixel_values)
         >>> last_hidden_state = outputs.last_hidden_state
         >>> pooled_output = outputs.pooler_output  # pooled CLS states
         ```"""
         return self.vision_model(
             pixel_values=pixel_values,
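A side note on the reshape in the new example: `XCLIPVisionModel` encodes frames independently, so the 5D `pixel_values` tensor produced by the processor is flattened to 4D first. A minimal sketch of the shapes involved, assuming one video, the 8 sampled frames, and a 224x224 input resolution:

```python
# Shape walk-through for the reshape in the example above (assumed sizes, illustrative only).
import torch

pixel_values = torch.randn(1, 8, 3, 224, 224)  # (batch_size, num_frames, num_channels, height, width)

batch_size, num_frames, num_channels, height, width = pixel_values.shape
# Flatten the batch and frame dimensions so each frame becomes one example for the vision encoder.
pixel_values = pixel_values.reshape(-1, num_channels, height, width)
print(pixel_values.shape)  # torch.Size([8, 3, 224, 224])
```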
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from transformers import CLIPTokenizer, XCLIPModel
+        >>> from transformers import AutoTokenizer, AutoModel
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
         >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 8 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
+        >>> inputs = processor(videos=list(video), return_tensors="pt")
         >>> video_features = model.get_video_features(**inputs)
         ```"""
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:
         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+        >>> np.random.seed(0)
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+        >>> # sample 8 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
         >>> inputs = processor(
-        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ...     text=["playing sports", "eating spaghetti", "go shopping"],
+        ...     videos=list(video),
+        ...     return_tensors="pt",
+        ...     padding=True,
         ... )
-        >>> outputs = model(**inputs)
+        >>> # forward pass
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
         >>> logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
         >>> probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        >>> print(probs)
+        tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
         ```"""
         # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...
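As a possible follow-up to the forward-pass example (not in the diff), the printed probabilities can be mapped back to the candidate labels. A small self-contained sketch using the values shown in the docstring:

```python
# Map the probabilities from the doc example above back to the text labels (illustrative).
import torch

probs = torch.tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])  # as printed in the example
labels = ["playing sports", "eating spaghetti", "go shopping"]

print(labels[probs.argmax(dim=-1).item()])  # "eating spaghetti"
```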
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
             torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
         )
-        expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device)
+        expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)
         self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
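The reference logits presumably shift because preprocessing for this checkpoint now goes through `XCLIPProcessor` rather than `CLIPProcessor`. A quick self-contained check (values taken from the test above) showing that the shift is far larger than the test tolerance, which is why the expected values had to be regenerated:

```python
# Compare the old and new reference logits from the integration test (values from the diff above).
import torch

old = torch.tensor([[14.3819, 20.6031, 15.0526]])
new = torch.tensor([[14.0181, 20.2771, 14.4776]])

print((old - new).abs().max())              # ~0.575, well above the test tolerance
print(torch.allclose(old, new, atol=1e-3))  # False
```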
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
 src/transformers/models/whisper/modeling_whisper.py
 src/transformers/models/whisper/modeling_tf_whisper.py
 src/transformers/models/yolos/modeling_yolos.py
+src/transformers/models/x_clip/modeling_x_clip.py
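With the module added to the doc-test list, its updated `>>>` examples are picked up by the documentation test run. The exact CI command isn't shown in this diff; a rough local equivalent using Python's built-in `doctest` (assuming `torch`, `decord`, and network access are available) would be:

```python
# Rough local run of the X-CLIP docstring examples (slow: downloads checkpoints and a video).
import doctest

import transformers.models.x_clip.modeling_x_clip as xclip_module

results = doctest.testmod(xclip_module, verbose=False)
print(f"{results.attempted} examples attempted, {results.failed} failed")
```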