Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
9e290804
Unverified
Commit
9e290804
authored
Oct 12, 2022
by
NielsRogge
Committed by
GitHub
Oct 12, 2022
Browse files
[X-CLIP] Fix doc tests (#19523)
* Fix XCLIP doc tests * Add model to doc test list * Fix tests
parent
eefcecaa
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
111 additions
and
31 deletions
+111
-31
src/transformers/models/auto/processing_auto.py
src/transformers/models/auto/processing_auto.py
+1
-1
src/transformers/models/x_clip/modeling_x_clip.py
src/transformers/models/x_clip/modeling_x_clip.py
+108
-29
tests/models/x_clip/test_modeling_x_clip.py
tests/models/x_clip/test_modeling_x_clip.py
+1
-1
utils/documentation_tests.txt
utils/documentation_tests.txt
+1
-0
No files found.
src/transformers/models/auto/processing_auto.py
View file @
9e290804
...
...
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
(
"wav2vec2_with_lm"
,
"Wav2Vec2ProcessorWithLM"
),
(
"wavlm"
,
"Wav2Vec2Processor"
),
(
"whisper"
,
"WhisperProcessor"
),
(
"xclip"
,
"CLIPProcessor"
),
(
"xclip"
,
"XCLIPProcessor"
),
]
)
...
...
src/transformers/models/x_clip/modeling_x_clip.py
View file @
9e290804
...
...
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
Examples:
```python
>>> from
PIL
import
Image
>>> import
requests
>>>
from transformers import XCLIPProcessor, XCLIPVisionModel
>>> from
decord
import
VideoReader, cpu
>>> import
torch
>>>
import numpy as np
>>> from transformers import AutoProcessor, XCLIPVisionModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
>>> inputs = processor(images=image, return_tensors="pt")
>>> batch_size, num_frames, num_channels, height, width = pixel_values.shape
>>> pixel_values = pixel_values.reshape(-1, num_channels, height, width)
>>> outputs = model(**inputs)
>>> outputs = model(pixel_values)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
return
self
.
vision_model
(
pixel_values
=
pixel_values
,
...
...
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
```python
>>> from transformers import
CLIP
Tokenizer,
XCLIP
Model
>>> from transformers import
Auto
Tokenizer,
Auto
Model
>>>
model = XCLIPModel
.from_pretrained("microsoft/xclip-base-patch32")
>>>
tokenizer = CLIPTokenizer
.from_pretrained("microsoft/xclip-base-patch32")
>>>
tokenizer = AutoTokenizer
.from_pretrained("microsoft/xclip-base-patch32")
>>>
model = AutoModel
.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
...
...
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import XCLIPProcessor, XCLIPModel
>>> from decord import VideoReader, cpu
>>> import torch
>>> import numpy as np
>>> from transformers import AutoProcessor, AutoModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = processor(videos=list(video), return_tensors="pt")
>>> video_features = model.get_video_features(**inputs)
```"""
...
...
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import XCLIPProcessor, XCLIPModel
>>> from decord import VideoReader, cpu
>>> import torch
>>> import numpy as np
>>> from transformers import AutoProcessor, AutoModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... text=["playing sports", "eating spaghetti", "go shopping"],
... videos=list(video),
... return_tensors="pt",
... padding=True,
... )
>>> outputs = model(**inputs)
>>> # forward pass
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> logits_per_video = outputs.logits_per_video # this is the video-text similarity score
>>> probs = logits_per_video.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> print(probs)
tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
```"""
# Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
output_attentions
=
output_attentions
if
output_attentions
is
not
None
else
self
.
config
.
output_attentions
...
...
tests/models/x_clip/test_modeling_x_clip.py
View file @
9e290804
...
...
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
torch
.
Size
((
inputs
.
input_ids
.
shape
[
0
],
inputs
.
pixel_values
.
shape
[
0
])),
)
expected_logits
=
torch
.
tensor
([[
14.
3
81
9
,
20.
603
1
,
1
5.052
6
]],
device
=
torch_device
)
expected_logits
=
torch
.
tensor
([[
14.
01
81
,
20.
277
1
,
1
4.477
6
]],
device
=
torch_device
)
self
.
assertTrue
(
torch
.
allclose
(
outputs
.
logits_per_video
,
expected_logits
,
atol
=
1e-3
))
utils/documentation_tests.txt
View file @
9e290804
...
...
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
src/transformers/models/whisper/modeling_whisper.py
src/transformers/models/whisper/modeling_tf_whisper.py
src/transformers/models/yolos/modeling_yolos.py
src/transformers/models/x_clip/modeling_x_clip.py
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment