chenpangpang/transformers, commit b5b4e549 (unverified)

add and fix examples (#12810)

Authored Jul 20, 2021 by Suraj Patil; committed by GitHub on Jul 20, 2021. Parent: 31d06729.

Showing 3 changed files with 102 additions and 3 deletions (+102, -3):
  docs/source/model_doc/clip.rst                       +0   -1
  src/transformers/models/clip/modeling_clip.py        +73  -0
  src/transformers/models/clip/modeling_flax_clip.py   +29  -2
docs/source/model_doc/clip.rst

@@ -60,7 +60,6 @@ encode the text and prepare the images. The following example shows how to get t
 .. code-block::

-    >>> import torch
     >>> from PIL import Image
     >>> import requests
src/transformers/models/clip/modeling_clip.py

@@ -699,6 +699,18 @@ class CLIPTextModel(CLIPPreTrainedModel):
         r"""
         Returns:

+        Examples::
+
+            >>> from transformers import CLIPTokenizer, CLIPTextModel
+
+            >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+
+            >>> outputs = model(**inputs)
+            >>> last_hidden_state = outputs.last_hidden_state
+            >>> pooled_output = outputs.pooled_output # pooled (EOS token) states
         """
         return self.text_model(
             input_ids=input_ids,
@@ -791,6 +803,23 @@ class CLIPVisionModel(CLIPPreTrainedModel):
         r"""
         Returns:

+        Examples::
+
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> from transformers import CLIPProcessor, CLIPVisionModel
+
+            >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> inputs = processor(images=image, return_tensors="pt")
+
+            >>> outputs = model(**inputs)
+            >>> last_hidden_state = outputs.last_hidden_state
+            >>> pooled_output = outputs.pooled_output # pooled CLS states
         """
         return self.vision_model(
             pixel_values=pixel_values,
@@ -847,6 +876,16 @@ class CLIPModel(CLIPPreTrainedModel):
         Returns:
             text_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The text embeddings
             obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPTextModel`.

+        Examples::
+
+            >>> from transformers import CLIPTokenizer, CLIPModel
+
+            >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+            >>> text_features = model.get_text_features(**inputs)
         """
         text_outputs = self.text_model(
             input_ids=input_ids,
@@ -874,6 +913,22 @@ class CLIPModel(CLIPPreTrainedModel):
         Returns:
             image_features (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, output_dim`): The image embeddings
             obtained by applying the projection layer to the pooled output of :class:`~transformers.CLIPVisionModel`.

+        Examples::
+
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> from transformers import CLIPProcessor, CLIPModel
+
+            >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> inputs = processor(images=image, return_tensors="pt")
+
+            >>> image_features = model.get_image_features(**inputs)
         """
         vision_outputs = self.vision_model(
             pixel_values=pixel_values,
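The two feature-extraction examples above stop at the raw projected embeddings. As a hedged sketch that is not part of this diff, the snippet below shows one common way to compare them: L2-normalize both sets of features and take a dot product, which mirrors the cosine-similarity comparison CLIP is trained on. The checkpoint, image URL, and captions are taken from the docstring examples; the normalization and matmul steps are illustrative.

    import requests
    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    text_inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
    image_inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)     # (num_texts, projection_dim)
        image_features = model.get_image_features(**image_inputs)  # (num_images, projection_dim)

    # L2-normalize so the dot product below is a cosine similarity.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    similarity = image_features @ text_features.T  # (num_images, num_texts)

CLIPModel.forward additionally applies a learned logit scale before the softmax, so these raw similarities are proportional to, but not identical to, logits_per_image in the next hunk.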
@@ -903,6 +958,24 @@ class CLIPModel(CLIPPreTrainedModel):
         r"""
         Returns:

+        Examples::
+
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> from transformers import CLIPProcessor, CLIPModel
+
+            >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+
+            >>> outputs = model(**inputs)
+            >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+            >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
         """
         return_dict = return_dict if return_dict is not None else self.config.return_dict

         vision_outputs = self.vision_model(
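The forward example above ends at the per-caption probabilities. The sketch below (not part of the commit; the caption list is illustrative) shows how those probabilities can be turned into a zero-shot prediction with argmax.

    import requests
    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    captions = ["a photo of a cat", "a photo of a dog"]  # illustrative candidate labels
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs.logits_per_image.softmax(dim=1)  # (num_images, num_captions)
    best = probs.argmax(dim=-1)[0].item()            # most likely caption for the first image
    print(f"{captions[best]!r}: {probs[0, best].item():.3f}")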
src/transformers/models/clip/modeling_flax_clip.py

@@ -803,6 +803,16 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
         Returns:
             text_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim`): The text embeddings
             obtained by applying the projection layer to the pooled output of :class:`~transformers.FlaxCLIPTextModel`.

+        Examples::
+
+            >>> from transformers import CLIPTokenizer, FlaxCLIPModel
+
+            >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
+            >>> text_features = model.get_text_features(**inputs)
         """
         if position_ids is None:
             position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
@@ -848,6 +858,22 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
             image_features (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, output_dim`): The image embeddings
             obtained by applying the projection layer to the pooled output of
             :class:`~transformers.FlaxCLIPVisionModel`

+        Examples::
+
+            >>> from PIL import Image
+            >>> import requests
+
+            >>> from transformers import CLIPProcessor, FlaxCLIPModel
+
+            >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+
+            >>> inputs = processor(images=image, return_tensors="np")
+
+            >>> image_features = model.get_image_features(**inputs)
         """
         pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
@@ -907,6 +933,7 @@ FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
     Returns:

     Example::

         >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel

         >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
@@ -957,9 +984,9 @@ FLAX_CLIP_VISION_MODEL_DOCSTRING = """
     Returns:

     Example::

         >>> from PIL import Image
         >>> import requests

         >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel

         >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
@@ -1078,10 +1105,10 @@ FLAX_CLIP_MODEL_DOCSTRING = """
     Returns:

     Example::

         >>> import jax
         >>> from PIL import Image
         >>> import requests

         >>> from transformers import CLIPProcessor, FlaxCLIPModel

         >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
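The FLAX_CLIP_MODEL_DOCSTRING hunk is truncated in this view after the model is loaded. As a hedged sketch of the full Flax flow, not taken verbatim from the diff, the snippet below runs the processor output through FlaxCLIPModel and converts the image-text logits into probabilities with jax.nn.softmax; the checkpoint, URL, and captions mirror the other examples, and the remaining lines are illustrative.

    import jax
    import requests
    from PIL import Image
    from transformers import CLIPProcessor, FlaxCLIPModel

    model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=image,
        return_tensors="np",
        padding=True,
    )
    outputs = model(**inputs)

    # logits_per_image has shape (num_images, num_texts); softmax over the text
    # axis gives per-caption probabilities, as in the PyTorch example above.
    probs = jax.nn.softmax(outputs.logits_per_image, axis=1)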