chenpangpang / transformers, commit 9e290804
Unverified commit, authored Oct 12, 2022 by NielsRogge and committed via GitHub on Oct 12, 2022
[X-CLIP] Fix doc tests (#19523)
* Fix XCLIP doc tests
* Add model to doc test list
* Fix tests
parent eefcecaa
Showing 4 changed files with 111 additions and 31 deletions:

src/transformers/models/auto/processing_auto.py (+1 / -1)
src/transformers/models/x_clip/modeling_x_clip.py (+108 / -29)
tests/models/x_clip/test_modeling_x_clip.py (+1 / -1)
utils/documentation_tests.txt (+1 / -0)
src/transformers/models/auto/processing_auto.py
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
         ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"),
         ("wavlm", "Wav2Vec2Processor"),
         ("whisper", "WhisperProcessor"),
-        ("xclip", "CLIPProcessor"),
+        ("xclip", "XCLIPProcessor"),
     ]
 )
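Note on the change above: the "xclip" entry previously resolved to the plain CLIPProcessor, so the auto class handed back a processor without the video-specific handling the doc examples need; it now resolves to XCLIPProcessor. A minimal sketch (not part of the diff) for checking the new mapping locally, using the same checkpoint as the doc examples:

from transformers import AutoProcessor, XCLIPProcessor

# With "xclip" mapped to "XCLIPProcessor" in PROCESSOR_MAPPING_NAMES, the auto
# class should resolve X-CLIP checkpoints to the model's own processor class.
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
print(type(processor).__name__)  # expected: XCLIPProcessor
assert isinstance(processor, XCLIPProcessor)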
src/transformers/models/x_clip/modeling_x_clip.py
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
         Examples:

         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPVisionModel
-
-        >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, return_tensors="pt")
-
-        >>> outputs = model(**inputs)
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from transformers import AutoProcessor, XCLIPVisionModel
+        >>> from huggingface_hub import hf_hub_download
+
+        >>> np.random.seed(0)
+
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
+
+        >>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
+
+        >>> batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        >>> pixel_values = pixel_values.reshape(-1, num_channels, height, width)
+
+        >>> outputs = model(pixel_values)
         >>> last_hidden_state = outputs.last_hidden_state
         >>> pooled_output = outputs.pooler_output  # pooled CLS states
         ```"""
         return self.vision_model(
             pixel_values=pixel_values,
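Note on the hunk above: the sample_frame_indices helper added to these examples picks a random window inside the clip and samples clip_len evenly spaced frame indices from it. A standalone sketch (not part of the diff, only numpy needed) of what the helper returns; the exact index values depend on the random window:

import numpy as np

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    # Same helper as in the docstrings above: choose a random window of
    # clip_len * frame_sample_rate frames and sample clip_len indices from it.
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

np.random.seed(0)
idx = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=300)
print(len(idx))                          # 8 indices, one per sampled frame
print(idx.min() >= 0, idx.max() < 300)   # all indices fall inside the 300-frame clip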
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:

         ```python
-        >>> from transformers import CLIPTokenizer, XCLIPModel
+        >>> from transformers import AutoTokenizer, AutoModel

-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> tokenizer = CLIPTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")

         >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:

         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
-
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+
+        >>> np.random.seed(0)
+
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
+
+        >>> inputs = processor(videos=list(video), return_tensors="pt")

         >>> video_features = model.get_video_features(**inputs)
         ```"""
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
         Examples:

         ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import XCLIPProcessor, XCLIPModel
-
-        >>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        >>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
+        >>> from decord import VideoReader, cpu
+        >>> import torch
+        >>> import numpy as np
+
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from huggingface_hub import hf_hub_download
+
+        >>> np.random.seed(0)
+
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+
+        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+
+        >>> # sample 16 frames
+        >>> vr.seek(0)
+        >>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
+        >>> video = vr.get_batch(indices).asnumpy()
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
+        >>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
+
         >>> inputs = processor(
-        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ...     text=["playing sports", "eating spaghetti", "go shopping"],
+        ...     videos=list(video),
+        ...     return_tensors="pt",
+        ...     padding=True,
         ... )

-        >>> outputs = model(**inputs)
+        >>> # forward pass
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
         >>> logits_per_video = outputs.logits_per_video  # this is the video-text similarity score
         >>> probs = logits_per_video.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        >>> print(probs)
+        tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
         ```"""
         # Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
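Follow-up note on the new forward example: the printed probabilities correspond, in order, to the three text prompts passed to the processor, so the best-matching caption can be read off with an argmax. A small illustrative snippet (not part of the diff), rebuilt from the values printed in the example:

import torch

# Map the probabilities printed by the docstring example back to the prompts.
probs = torch.tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
labels = ["playing sports", "eating spaghetti", "go shopping"]
print(labels[probs.argmax(dim=1).item()])  # eating spaghetti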
tests/models/x_clip/test_modeling_x_clip.py
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
             torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
         )
-        expected_logits = torch.tensor([[14.3819, 20.6031, 15.0526]], device=torch_device)
+        expected_logits = torch.tensor([[14.0181, 20.2771, 14.4776]], device=torch_device)
         self.assertTrue(torch.allclose(outputs.logits_per_video, expected_logits, atol=1e-3))
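Note: the integration test keeps comparing the updated expected logits with torch.allclose at an absolute tolerance of 1e-3, i.e. elementwise abs(a - b) <= atol + rtol * abs(b). A tiny sketch of what that tolerance allows (the perturbation values below are made up for illustration):

import torch

# torch.allclose(a, b, rtol=1e-5, atol=1e-3) passes when differences stay within
# roughly a thousandth; larger drift fails the check.
expected = torch.tensor([[14.0181, 20.2771, 14.4776]])
measured = expected + 5e-4  # within tolerance
drifted = expected + 5e-2   # outside tolerance
print(torch.allclose(measured, expected, atol=1e-3))  # True
print(torch.allclose(drifted, expected, atol=1e-3))   # False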
utils/documentation_tests.txt
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
 src/transformers/models/whisper/modeling_whisper.py
 src/transformers/models/whisper/modeling_tf_whisper.py
 src/transformers/models/yolos/modeling_yolos.py
+src/transformers/models/x_clip/modeling_x_clip.py
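Adding modeling_x_clip.py to utils/documentation_tests.txt registers its docstring examples with the project's doc-test run. As a rough local approximation (not the project's actual runner, which preprocesses the examples and is driven by this list), one can point the standard-library doctest module at the module; it needs the example dependencies such as decord and network access, and exact printed tensors may still only match under the project's own setup:

import doctest

import transformers.models.x_clip.modeling_x_clip as xclip_modeling

# Collect and execute the `>>>` examples embedded in the module's docstrings.
results = doctest.testmod(
    xclip_modeling,
    verbose=False,
    optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS,
)
print(f"attempted={results.attempted}, failed={results.failed}")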