Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
9e290804
Unverified
Commit
9e290804
authored
Oct 12, 2022
by
NielsRogge
Committed by
GitHub
Oct 12, 2022
Browse files
[X-CLIP] Fix doc tests (#19523)
* Fix XCLIP doc tests * Add model to doc test list * Fix tests
parent
eefcecaa
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
111 additions
and
31 deletions
+111
-31
src/transformers/models/auto/processing_auto.py
src/transformers/models/auto/processing_auto.py
+1
-1
src/transformers/models/x_clip/modeling_x_clip.py
src/transformers/models/x_clip/modeling_x_clip.py
+108
-29
tests/models/x_clip/test_modeling_x_clip.py
tests/models/x_clip/test_modeling_x_clip.py
+1
-1
utils/documentation_tests.txt
utils/documentation_tests.txt
+1
-0
No files found.
src/transformers/models/auto/processing_auto.py
View file @
9e290804
...
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
...
@@ -62,7 +62,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
(
"wav2vec2_with_lm"
,
"Wav2Vec2ProcessorWithLM"
),
(
"wav2vec2_with_lm"
,
"Wav2Vec2ProcessorWithLM"
),
(
"wavlm"
,
"Wav2Vec2Processor"
),
(
"wavlm"
,
"Wav2Vec2Processor"
),
(
"whisper"
,
"WhisperProcessor"
),
(
"whisper"
,
"WhisperProcessor"
),
(
"xclip"
,
"CLIPProcessor"
),
(
"xclip"
,
"
X
CLIPProcessor"
),
]
]
)
)
...
...
src/transformers/models/x_clip/modeling_x_clip.py
View file @
9e290804
...
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
...
@@ -1061,21 +1061,46 @@ class XCLIPVisionModel(XCLIPPreTrainedModel):
Examples:
Examples:
```python
```python
>>> from
PIL
import
Image
>>> from
decord
import
VideoReader, cpu
>>> import
requests
>>> import
torch
>>>
from transformers import XCLIPProcessor, XCLIPVisionModel
>>>
import numpy as np
>>> from transformers import AutoProcessor, XCLIPVisionModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
>>> model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> batch_size, num_frames, num_channels, height, width = pixel_values.shape
>>> pixel_values = pixel_values.reshape(-1, num_channels, height, width)
>>> outputs = model(
**input
s)
>>> outputs = model(
pixel_value
s)
>>> last_hidden_state = outputs.last_hidden_state
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output # pooled CLS states
```"""
```"""
return
self
.
vision_model
(
return
self
.
vision_model
(
pixel_values
=
pixel_values
,
pixel_values
=
pixel_values
,
...
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
...
@@ -1288,10 +1313,10 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
Examples:
```python
```python
>>> from transformers import
CLIP
Tokenizer,
XCLIP
Model
>>> from transformers import
Auto
Tokenizer,
Auto
Model
>>>
model = XCLIPModel
.from_pretrained("microsoft/xclip-base-patch32")
>>>
tokenizer = AutoTokenizer
.from_pretrained("microsoft/xclip-base-patch32")
>>>
tokenizer = CLIPTokenizer
.from_pretrained("microsoft/xclip-base-patch32")
>>>
model = AutoModel
.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
>>> text_features = model.get_text_features(**inputs)
...
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
...
@@ -1334,17 +1359,40 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
Examples:
```python
```python
>>> from PIL import Image
>>> from decord import VideoReader, cpu
>>> import requests
>>> import torch
>>> from transformers import XCLIPProcessor, XCLIPModel
>>> import numpy as np
>>> from transformers import AutoProcessor, AutoModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = processor(videos=list(video), return_tensors="pt")
>>> video_features = model.get_video_features(**inputs)
>>> video_features = model.get_video_features(**inputs)
```"""
```"""
...
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
...
@@ -1399,23 +1447,54 @@ class XCLIPModel(XCLIPPreTrainedModel):
Examples:
Examples:
```python
```python
>>> from PIL import Image
>>> from decord import VideoReader, cpu
>>> import requests
>>> import torch
>>> from transformers import XCLIPProcessor, XCLIPModel
>>> import numpy as np
>>> from transformers import AutoProcessor, AutoModel
>>> from huggingface_hub import hf_hub_download
>>> np.random.seed(0)
>>> model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
>>> processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
... converted_len = int(clip_len * frame_sample_rate)
... end_idx = np.random.randint(converted_len, seg_len)
... start_idx = end_idx - converted_len
... indices = np.linspace(start_idx, end_idx, num=clip_len)
... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
... return indices
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
>>> # sample 8 frames
>>> vr.seek(0)
>>> indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=len(vr))
>>> video = vr.get_batch(indices).asnumpy()
>>> processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
>>> model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
>>> inputs = processor(
>>> inputs = processor(
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... text=["playing sports", "eating spaghetti", "go shopping"],
... videos=list(video),
... return_tensors="pt",
... padding=True,
... )
... )
>>> outputs = model(**inputs)
>>> # forward pass
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> logits_per_video = outputs.logits_per_video # this is the video-text similarity score
>>> logits_per_video = outputs.logits_per_video # this is the video-text similarity score
>>> probs = logits_per_video.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> probs = logits_per_video.softmax(dim=1) # we can take the softmax to get the label probabilities
>>> print(probs)
tensor([[1.9496e-04, 9.9960e-01, 2.0825e-04]])
```"""
```"""
# Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
# Use X_CLIP model's config for some fields (if specified) instead of those of vision & text components.
output_attentions
=
output_attentions
if
output_attentions
is
not
None
else
self
.
config
.
output_attentions
output_attentions
=
output_attentions
if
output_attentions
is
not
None
else
self
.
config
.
output_attentions
...
...
tests/models/x_clip/test_modeling_x_clip.py
View file @
9e290804
...
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
...
@@ -667,6 +667,6 @@ class XCLIPModelIntegrationTest(unittest.TestCase):
torch
.
Size
((
inputs
.
input_ids
.
shape
[
0
],
inputs
.
pixel_values
.
shape
[
0
])),
torch
.
Size
((
inputs
.
input_ids
.
shape
[
0
],
inputs
.
pixel_values
.
shape
[
0
])),
)
)
expected_logits
=
torch
.
tensor
([[
14.
3
81
9
,
20.
603
1
,
1
5.052
6
]],
device
=
torch_device
)
expected_logits
=
torch
.
tensor
([[
14.
01
81
,
20.
277
1
,
1
4.477
6
]],
device
=
torch_device
)
self
.
assertTrue
(
torch
.
allclose
(
outputs
.
logits_per_video
,
expected_logits
,
atol
=
1e-3
))
self
.
assertTrue
(
torch
.
allclose
(
outputs
.
logits_per_video
,
expected_logits
,
atol
=
1e-3
))
utils/documentation_tests.txt
View file @
9e290804
...
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
...
@@ -102,3 +102,4 @@ src/transformers/models/wavlm/modeling_wavlm.py
src/transformers/models/whisper/modeling_whisper.py
src/transformers/models/whisper/modeling_whisper.py
src/transformers/models/whisper/modeling_tf_whisper.py
src/transformers/models/whisper/modeling_tf_whisper.py
src/transformers/models/yolos/modeling_yolos.py
src/transformers/models/yolos/modeling_yolos.py
src/transformers/models/x_clip/modeling_x_clip.py
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment