# example.py

from sentence_transformers import SentenceTransformer, util, models
from PIL import Image


###########

from transformers import CLIPProcessor, CLIPModel

# Reference run with the Hugging Face CLIP implementation: one image is fed
# through the model together with two candidate captions.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

image = Image.open("two_dogs_in_snow.jpg")

# Model weights and the matching pre-processor come from the same checkpoint.
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# The caption keyword of CLIPProcessor is ``text`` (singular). A ``texts``
# keyword is not tokenized, which leaves the model call without ``input_ids``.
inputs = processor(text=["a cat", "a dog"], images=[image], return_tensors="pt", padding=True)
output = model(**inputs)

# Manual route to the projected image embedding, kept for reference:
# vision_outputs = model.vision_model(pixel_values=inputs['pixel_values'])
# image_embeds = model.visual_projection(vision_outputs[1])
# print(image_embeds.shape)


# Build a SentenceTransformer around the CLIP module, write it to disk and
# load it back from the saved directory.
model_dir = "tmp-clip-model"
clip = models.CLIPModel()
model = SentenceTransformer(modules=[clip])
model.save(model_dir)
model = SentenceTransformer(model_dir)

# Embed the picture with the reloaded model.
picture = Image.open("two_dogs_in_snow.jpg")
img_emb = model.encode(picture)

# Embed the candidate captions with the same model.
captions = [
    "Two dogs in the snow",
    "A cat on a table",
    "A picture of London at night",
]
text_emb = model.encode(captions)

# Cosine similarity between the image embedding and every caption embedding.
cos_scores = util.cos_sim(img_emb, text_emb)
print(cos_scores)