example.py 1.08 KB
Newer Older
Rayyyyy's avatar
Rayyyyy committed
1
2
from PIL import Image

Rayyyyy's avatar
Rayyyyy committed
3
from sentence_transformers import SentenceTransformer, models, util
Rayyyyy's avatar
Rayyyyy committed
4
5
6
7
8

###########

# Open the sample picture (path is relative to the working directory); the
# CLIPProcessor call further down passes this object as its `images` input.
image = Image.open("two_dogs_in_snow.jpg")

Rayyyyy's avatar
Rayyyyy committed
9
from transformers import CLIPModel, CLIPProcessor
Rayyyyy's avatar
Rayyyyy committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

# --- Raw Hugging Face CLIP: feed captions and an image straight to the model ---
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# CLIPProcessor receives the captions through the `text` keyword (see the
# transformers documentation). The earlier `texts=` spelling is not a
# processor parameter, so the captions were not tokenized and the forward
# pass below was left without any text input.
inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)

# Debugging aid, intentionally left disabled: compute the projected image
# embedding by hand and stop the script.
# vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
# image_embeds = model.visual_projection(vision_outputs[1])
# print(image_embeds.shape)
# exit()


# --- sentence-transformers CLIP: build, save, reload, then compare embeddings ---

# Wrap the CLIP module (constructed with its default arguments) in a
# SentenceTransformer pipeline.
clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])

# Write the pipeline to disk and load it back from the same directory.
# NOTE(review): presumably this exercises the save/load round trip - confirm.
model.save("tmp-clip-model")
model = SentenceTransformer("tmp-clip-model")

# Encode an image. The file is opened in a `with` block so its handle is
# released afterwards; encoding happens inside the block because Pillow only
# reads the pixel data lazily, while the file is still open.
with Image.open("two_dogs_in_snow.jpg") as query_image:
    img_emb = model.encode(query_image)

# Encode text descriptions
text_emb = model.encode(
    [
        "Two dogs in the snow",
        "A cat on a table",
        "A picture of London at night",
    ]
)

# Compute cosine similarities between the image embedding and each caption
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)