""" This is a simple application for sentence embeddings: semantic search We have a corpus with various sentences. Then, for a given query sentence, we want to find the most similar sentence in this corpus. This script outputs for various queries the top 5 most similar sentences in the corpus. """ import torch from sentence_transformers import SentenceTransformer embedder = SentenceTransformer("all-MiniLM-L6-v2") # Corpus with example sentences corpus = [ "A man is eating food.", "A man is eating a piece of bread.", "The girl is carrying a baby.", "A man is riding a horse.", "A woman is playing violin.", "Two men pushed carts through the woods.", "A man is riding a white horse on an enclosed ground.", "A monkey is playing drums.", "A cheetah is running behind its prey.", ] # Use "convert_to_tensor=True" to keep the tensors on GPU (if available) corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True) # Query sentences: queries = [ "A man is eating pasta.", "Someone in a gorilla costume is playing a set of drums.", "A cheetah chases prey on across a field.", ] # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity top_k = min(5, len(corpus)) for query in queries: query_embedding = embedder.encode(query, convert_to_tensor=True) # We use cosine-similarity and torch.topk to find the highest 5 scores similarity_scores = embedder.similarity(query_embedding, corpus_embeddings)[0] scores, indices = torch.topk(similarity_scores, k=top_k) print("\nQuery:", query) print("Top 5 most similar sentences in corpus:") for score, idx in zip(scores, indices): print(corpus[idx], "(Score: {:.4f})".format(score)) """ # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5) hits = hits[0] #Get the hits for the first query for hit in hits: print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score'])) """