Commit 0fccd232 authored by Rayyyyy: First add
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Corpus with example sentences
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"A man is eating pasta.",
"The girl is carrying a baby.",
"The baby is carried by the woman",
"A man is riding a horse.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"Someone in a gorilla costume is playing a set of drums.",
"A cheetah is running behind its prey.",
"A cheetah chases prey on across a field.",
]
corpus_embeddings = embedder.encode(corpus)
# Some models don't automatically normalize the embeddings. In that case, normalize them yourself
# (requires `import numpy as np`):
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")
"""
This is a more complex example of performing clustering on a large-scale dataset.
This example finds local communities in a large set of sentences, i.e., groups of sentences that are highly
similar. You can freely configure the threshold for what is considered similar. A high threshold will
only find extremely similar sentences; a lower threshold will find more sentences that are less similar.
A second parameter is 'min_community_size': Only communities with at least a certain number of sentences will be returned.
The method for finding the communities is extremely fast: clustering 50k sentences requires only about 5 seconds (plus embedding computation).
In this example, we download a large set of questions from Quora and then find similar questions in this set.
"""
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time
# Model for computing sentence embeddings. We use one trained for similar questions detection
model = SentenceTransformer("all-MiniLM-L6-v2")
# We download the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
# and find similar questions in it
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 50000 # We limit our corpus to only the first 50k questions
# Check if the dataset exists. If not, download it
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)
# Get all unique sentences from the file
corpus_sentences = set()
with open(dataset_path, encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.add(row["question1"])
        corpus_sentences.add(row["question2"])
        if len(corpus_sentences) >= max_corpus_size:
            break
corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
print("Start clustering")
start_time = time.time()
# Two parameters to tune:
# min_community_size: Only consider communities that have at least 25 sentences
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)
print("Clustering done after {:.2f} sec".format(time.time() - start_time))

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i + 1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-means clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Corpus with example sentences
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"A man is eating pasta.",
"The girl is carrying a baby.",
"The baby is carried by the woman",
"A man is riding a horse.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"Someone in a gorilla costume is playing a set of drums.",
"A cheetah is running behind its prey.",
"A cheetah chases prey on across a field.",
]
corpus_embeddings = embedder.encode(corpus)
# Perform k-means clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print("")
# Computing Sentence Embeddings
The basic function to compute sentence embeddings looks like this:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
# The sentences we would like to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of strings.",
    "The quick brown fox jumps over the lazy dog.",
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
```
**Note:** Even though we talk about sentence embeddings, you can also use these models for shorter phrases as well as for longer texts with multiple sentences. See the section on Input Sequence Length below for more notes on embeddings for paragraphs.
First, we load a sentence-transformer model:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("model_name_or_path")
```
You can either specify a [pre-trained model](https://www.sbert.net/docs/pretrained_models.html) or you can pass a path on your disc to load the sentence-transformer model from that folder.
If available, the model is automatically executed on the GPU. You can specify the device for the model like this:
```python
model = SentenceTransformer("model_name_or_path", device="cuda")
```
Here, *device* can be any PyTorch device (like `cpu`, `cuda`, `cuda:0`, etc.).
The relevant method to encode a set of sentences / texts is `model.encode()`. In the following, you can find the parameters this method accepts. Some relevant parameters are *batch_size* (depending on your GPU, a different batch size is optimal) as well as *convert_to_numpy* (returns a numpy matrix) and *convert_to_tensor* (returns a pytorch tensor).
```eval_rst
.. autoclass:: sentence_transformers.SentenceTransformer
:members: encode
```
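For example, a minimal call that sets a few of these parameters could look like this (the sentences and the batch size are purely illustrative):
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Larger batches are usually faster on a GPU; convert_to_tensor returns one pytorch tensor
embeddings = model.encode(
    ["This is an example sentence", "Each sentence is converted"],
    batch_size=32,
    convert_to_tensor=True,
    show_progress_bar=False,
)
print(embeddings.shape)  # torch.Size([2, 384]) for this model
```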
## Prompt Templates
Some models require using specific text *prompts* to achieve optimal performance. For example, with [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) you should prefix all queries with `query: ` and all passages with `passage: `. Another example is [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5), which performs best for retrieval when the input texts are prefixed with `Represent this sentence for searching relevant passages: `.
Sentence Transformer models can be initialized with `prompts` and `default_prompt_name` parameters:
* `prompts` is an optional argument that accepts a dictionary mapping prompt names to prompt texts. The prompt will be prepended to the input text during inference. For example,
```python
model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={
        "classification": "Classify the following text: ",
        "retrieval": "Retrieve semantically similar text: ",
        "clustering": "Identify the topic or theme based on the text: ",
    },
)
# or
model.prompts = {
    "classification": "Classify the following text: ",
    "retrieval": "Retrieve semantically similar text: ",
    "clustering": "Identify the topic or theme based on the text: ",
}
```
* `default_prompt_name` is an optional argument that determines the default prompt to be used. It has to correspond with a prompt name from `prompts`. If `None`, then no prompt is used by default. For example,
```python
model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={
        "classification": "Classify the following text: ",
        "retrieval": "Retrieve semantically similar text: ",
        "clustering": "Identify the topic or theme based on the text: ",
    },
    default_prompt_name="retrieval",
)
# or
model.default_prompt_name = "retrieval"
```
Both of these parameters can also be specified in the `config_sentence_transformers.json` file of a saved model. That way, you won't have to specify these options manually when loading. When you save a Sentence Transformer model, these options will be automatically saved as well.
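As a small sketch of this behavior (the local save path is hypothetical), saving and re-loading a model keeps the configured prompts without specifying them again:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={"retrieval": "Retrieve semantically similar text: "},
    default_prompt_name="retrieval",
)
# Saving stores the prompts in config_sentence_transformers.json ...
model.save("local-e5-with-prompts")  # hypothetical local path
# ... so the re-loaded model applies the same default prompt
model = SentenceTransformer("local-e5-with-prompts")
```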
During inference, prompts can be applied in a few different ways. All of these scenarios result in identical texts being embedded:
1. Explicitly using the `prompt` option in `SentenceTransformer.encode`:
```python
embeddings = model.encode("How to bake a strawberry cake", prompt="Retrieve semantically similar text: ")
```
2. Explicitly using the `prompt_name` option in `SentenceTransformer.encode` by relying on the prompts loaded from a) initialization or b) the model config.
```python
embeddings = model.encode("How to bake a strawberry cake", prompt_name="retrieval")
```
3. If neither `prompt` nor `prompt_name` is specified in `SentenceTransformer.encode`, then the prompt specified by `default_prompt_name` will be applied. If it is `None`, then no prompt will be applied.
```python
embeddings = model.encode("How to bake a strawberry cake")
```
## Input Sequence Length
For transformer models like BERT / RoBERTa / DistilBERT etc., the runtime and memory requirements grow quadratically with the input length. This limits transformers to inputs of a certain length. A common value for BERT & Co. is 512 word pieces, which corresponds to about 300-400 words (for English). Texts longer than this are truncated to the first x word pieces.
By default, the provided methods use a limit of 128 word pieces; longer inputs will be truncated. You can get and set the maximal sequence length like this:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Max Sequence Length:", model.max_seq_length)
# Change the length to 200
model.max_seq_length = 200
print("Max Sequence Length:", model.max_seq_length)
```
**Note:** You cannot increase the length beyond what is maximally supported by the respective transformer model. Also note that if a model was trained on short texts, the representations for long texts might not be that good.
## Storing & Loading Embeddings
The easiest method is to use *pickle* to store pre-computed embeddings on disc and to load them from disc later. This can be especially useful if you need to encode a large set of sentences.
```python
from sentence_transformers import SentenceTransformer
import pickle
model = SentenceTransformer("all-MiniLM-L6-v2")
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]
embeddings = model.encode(sentences)

# Store sentences & embeddings on disc
with open("embeddings.pkl", "wb") as fOut:
    pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Load sentences & embeddings from disc
with open("embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]
```
## Multi-Process / Multi-GPU Encoding
You can encode input texts with more than one GPU (or with multiple processes on a CPU machine). For an example, see: [computing_embeddings_multi_gpu.py](computing_embeddings_multi_gpu.py).
The relevant method is `start_multi_process_pool()`, which starts multiple processes that are used for encoding.
```eval_rst
.. automethod:: sentence_transformers.SentenceTransformer.start_multi_process_pool
```
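A minimal sketch of the multi-process workflow (mirroring the full example script included further below) looks like this:
```python
from sentence_transformers import SentenceTransformer

if __name__ == "__main__":
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences = ["This is sentence {}".format(i) for i in range(10000)]

    # One process per CUDA device (or several CPU processes if no GPU is available)
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(sentences, pool)
    model.stop_multi_process_pool(pool)
    print("Embeddings computed. Shape:", embeddings.shape)
```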
## Sentence Embeddings with Transformers
Most of our pre-trained models are based on [Huggingface.co/Transformers](https://huggingface.co/transformers/) and are also hosted in the [models repository](https://huggingface.co/models) from Huggingface. It is possible to use our sentence embedding models without installing sentence-transformers:
```python
from transformers import AutoTokenizer, AutoModel
import torch
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Sentences we want sentence embeddings for
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Tokenize sentences
encoded_input = tokenizer(
    sentences, padding=True, truncation=True, max_length=128, return_tensors="pt"
)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
```
You can find the available models here: [https://huggingface.co/sentence-transformers](https://huggingface.co/sentence-transformers)
In the above example, we add mean pooling on top of the AutoModel (which will load a BERT model). We also have models that use max-pooling or the CLS token. To see how to apply these pooling strategies correctly, have a look at [sentence-transformers/bert-base-nli-max-tokens](https://huggingface.co/sentence-transformers/bert-base-nli-max-tokens) and [sentence-transformers/bert-base-nli-cls-token](https://huggingface.co/sentence-transformers/bert-base-nli-cls-token).
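As a rough sketch, CLS-token pooling would replace the mean pooling above by simply taking the embedding of the first token; whether this matches how a particular model was trained should be checked on its model card:
```python
def cls_pooling(model_output):
    # The [CLS] token is the first token of each sequence
    return model_output[0][:, 0]


sentence_embeddings = cls_pooling(model_output)
```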
"""
This basic example loads a pre-trained model from the web and uses it to
generate sentence embeddings for a given list of sentences.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging
#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Load pre-trained Sentence Transformer Model. It will be downloaded automatically
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed a list of sentences
sentences = [
"This framework generates embeddings for each input sentence",
"Sentences are passed as a list of string.",
"The quick brown fox jumps over the lazy dog.",
]
sentence_embeddings = model.encode(sentences)
# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
"""
This example starts multiple processes (1 per GPU), which encode
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
# Important: you need to guard your code with `if __name__ == "__main__"`. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
    # Create a large list of 100k sentences
    sentences = ["This is sentence {}".format(i) for i in range(100000)]

    # Define the model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    # Compute the embeddings using the multi-process pool
    emb = model.encode_multi_process(sentences, pool)
    print("Embeddings computed. Shape:", emb.shape)

    # Optional: Stop the processes in the pool
    model.stop_multi_process_pool(pool)
"""
This example starts multiple processes (1 per GPU), which encode
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
It also demonstrates how to stream data which is helpful in case you don't
want to wait for an extremely large dataset to download, or if you want to
limit the amount of memory used. More info about dataset streaming:
https://huggingface.co/docs/datasets/stream
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
# Important: you need to guard your code with `if __name__ == "__main__"`. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
    # Set params
    data_stream_size = 16384  # Size of the data that is loaded into memory at once
    chunk_size = 1024  # Size of the chunks that are sent to each process
    encode_batch_size = 128  # Batch size of the model

    # Load a large dataset in streaming mode. More info: https://huggingface.co/docs/datasets/stream
    dataset = load_dataset("yahoo_answers_topics", split="train", streaming=True)
    dataloader = DataLoader(dataset.with_format("torch"), batch_size=data_stream_size)

    # Define the model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    for i, batch in enumerate(tqdm(dataloader)):
        # Compute the embeddings using the multi-process pool
        sentences = batch["best_answer"]
        batch_emb = model.encode_multi_process(sentences, pool, chunk_size=chunk_size, batch_size=encode_batch_size)
        print("Embeddings computed for 1 batch. Shape:", batch_emb.shape)

    # Optional: Stop the processes in the pool
    model.stop_multi_process_pool(pool)
# Cross-Encoders
SentenceTransformers also supports loading Cross-Encoders for sentence pair scoring and sentence pair classification tasks.
## Bi-Encoder vs. Cross-Encoder
First, it is important to understand the difference between Bi- and Cross-Encoders.
**Bi-Encoders** produce a sentence embedding for a given sentence. We pass sentences A and B independently to BERT, which results in the sentence embeddings u and v. These sentence embeddings can then be compared using cosine similarity:
![BiEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/Bi_vs_Cross-Encoder.png)
In contrast, for a **Cross-Encoder** we pass both sentences simultaneously to the Transformer network. It then produces an output value between 0 and 1 indicating the similarity of the input sentence pair.
A **Cross-Encoder does not produce a sentence embedding**. Also, we are not able to pass individual sentences to a Cross-Encoder.
As detailed in our [paper](https://arxiv.org/abs/1908.10084), Cross-Encoders achieve better performance than Bi-Encoders. However, for many applications they are not practical, as they do not produce embeddings that we could e.g. index or efficiently compare using cosine similarity.
## When to use Cross- / Bi-Encoders?
Cross-Encoders can be used whenever you have a pre-defined set of sentence pairs you want to score. For example, you have 100 sentence pairs and you want to get similarity scores for these 100 pairs.
Bi-Encoders (see [Computing Sentence Embeddings](../computing-embeddings/README.md)) are used whenever you need a sentence embedding in a vector space for efficient comparison. Applications are, for example, Information Retrieval / Semantic Search or Clustering. Cross-Encoders would be the wrong choice for these applications: clustering 10,000 sentences with Cross-Encoders would require computing similarity scores for about 50 million sentence pairs (10,000 · 9,999 / 2 ≈ 50 million), which takes about 65 hours. With a Bi-Encoder, you compute the embedding for each sentence, which takes only about 5 seconds. You can then perform the clustering.
## Cross-Encoders Usage
Using Cross-Encoders is quite easy:
```python
from sentence_transformers.cross_encoder import CrossEncoder
model = CrossEncoder("model_name_or_path")
scores = model.predict([["My first", "sentence pair"], ["Second text", "pair"]])
```
You pass to `model.predict` a list of sentence **pairs**. Note that Cross-Encoders do not work on individual sentences; you have to pass sentence pairs.
As the model name, you can pass any model name or path that is compatible with the Hugging Face [AutoModel](https://huggingface.co/transformers/model_doc/auto.html) class.
For a full example that scores a query against all possible sentences in a corpus, see [cross-encoder_usage.py](cross-encoder_usage.py).
## Combining Bi- and Cross-Encoders
Cross-Encoders achieve higher performance than Bi-Encoders; however, they do not scale well to large datasets. Here, it can make sense to combine Cross- and Bi-Encoders, for example in Information Retrieval / Semantic Search scenarios: first, you use an efficient Bi-Encoder to retrieve e.g. the top-100 most similar sentences for a query. Then, you use a Cross-Encoder to re-rank these 100 hits by computing the score for every (query, hit) combination.
For more details on combining Bi- and Cross-Encoders, see [Application - Information Retrieval](../retrieve_rerank/README.md).
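A condensed sketch of this retrieve-and-re-rank pattern (the corpus and the model choices are only illustrative; a complete script is included further below):
```python
from sentence_transformers import SentenceTransformer, CrossEncoder, util

bi_encoder = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/stsb-roberta-base")

corpus = ["A man is eating food.", "A monkey is playing drums.", "A woman is playing violin."]
corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True)

query = "Someone is making music."

# 1. Retrieve candidates with the efficient Bi-Encoder
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]

# 2. Re-rank the candidates with the more accurate Cross-Encoder
pairs = [[query, corpus[hit["corpus_id"]]] for hit in hits]
scores = cross_encoder.predict(pairs)
for hit, score in sorted(zip(hits, scores), key=lambda x: x[1], reverse=True):
    print(f"{score:.3f}\t{corpus[hit['corpus_id']]}")
```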
## Training Cross-Encoders
See [Cross-Encoder Training](../../training/cross-encoder/README.md) to learn how to train your own Cross-Encoder models.
"""
This script contains an example of how to perform re-ranking with a Cross-Encoder for semantic search.
First, we use an efficient Bi-Encoder to retrieve similar questions from the Quora Duplicate Questions dataset:
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs
Then, we re-rank the hits from the Bi-Encoder using a Cross-Encoder.
"""
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
import os
import csv
import pickle
import time
# We use a Bi-Encoder (SentenceTransformer) that produces embeddings for questions.
# We then search for similar questions using cosine similarity and identify the `num_candidates` most similar questions
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
num_candidates = 500
# To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question)
# and outputs a score 0...1 indicating the similarity.
cross_encoder_model = CrossEncoder("cross-encoder/stsb-roberta-base")
# Dataset we want to use
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 20000
# Some local file to cache computed embeddings
embedding_cache_path = "quora-embeddings-{}-size-{}.pkl".format(model_name.replace("/", "_"), max_corpus_size)
# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row["question1"])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row["question2"])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({"sentences": corpus_sentences, "embeddings": corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data["sentences"][0:max_corpus_size]
        corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]
###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
while True:
    inp_question = input("Please enter a question: ")
    print("Input question:", inp_question)

    # First, retrieve candidates using cosine similarity search
    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates)
    hits = hits[0]  # Get the hits for the first query

    print("Cosine-Similarity search took {:.3f} seconds".format(time.time() - start_time))
    print("Top 5 hits with cosine-similarity:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

    # Now, do the re-ranking with the cross-encoder
    start_time = time.time()
    sentence_pairs = [[inp_question, corpus_sentences[hit["corpus_id"]]] for hit in hits]
    ce_scores = cross_encoder_model.predict(sentence_pairs)

    for idx in range(len(hits)):
        hits[idx]["cross-encoder_score"] = ce_scores[idx]

    # Sort list by CrossEncoder scores
    hits = sorted(hits, key=lambda x: x["cross-encoder_score"], reverse=True)
    print("\nRe-ranking with CrossEncoder took {:.3f} seconds".format(time.time() - start_time))
    print("Top 5 hits with CrossEncoder:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit["cross-encoder_score"], corpus_sentences[hit["corpus_id"]]))

    print("\n\n========\n")
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
# Pre-trained cross encoder
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
# We want to compute the similarity between the query sentence
query = "A man is eating pasta."
# With all sentences in the corpus
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"The girl is carrying a baby.",
"A man is riding a horse.",
"A woman is playing violin.",
"Two men pushed carts through the woods.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"A cheetah is running behind its prey.",
]
# 1. We rank all sentences in the corpus for the query
ranks = model.rank(query, corpus)
# Print the scores
print("Query:", query)
for rank in ranks:
    print(f"{rank['score']:.2f}\t{corpus[rank['corpus_id']]}")
# 2. Alternatively, you can also manually compute the score between two sentences
sentence_combinations = [[query, sentence] for sentence in corpus]
scores = model.predict(sentence_combinations)
# Sort the scores in decreasing order to get the corpus indices
ranked_indices = np.argsort(scores)[::-1]
print("scores:", scores)
print("indices:", ranked_indices)
# Embedding Quantization
Embeddings may be challenging to scale up, which leads to expensive solutions and high latencies. Currently, many state-of-the-art models produce embeddings with 1024 dimensions, each of which is encoded in `float32`, i.e., they require 4 bytes per dimension. To perform retrieval over 50 million vectors, you would therefore need around 200GB of memory. This tends to require complex and costly solutions at scale.
However, there is a new approach to counter this problem; it entails reducing the size of each of the individual values in the embedding: **Quantization**. Experiments on quantization have shown that we can maintain a large amount of performance while significantly speeding up computation and saving on memory, storage, and costs.
To learn more about Embedding Quantization and its performance, please read the [blogpost](https://huggingface.co/blog/embedding-quantization) by Sentence Transformers and mixedbread.ai.
## Binary Quantization
Binary quantization refers to the conversion of the `float32` values in an embedding to 1-bit values, resulting in a 32x reduction in memory and storage usage. To quantize `float32` embeddings to binary, we simply threshold normalized embeddings at 0: if the value is larger than 0, we make it 1, otherwise we convert it to 0. We can use the Hamming Distance to efficiently perform retrieval with these binary embeddings. This is simply the number of positions at which the bits of two binary embeddings differ. The lower the Hamming Distance, the closer the embeddings, and thus the more relevant the document. A huge advantage of the Hamming Distance is that it can be easily calculated with 2 CPU cycles, allowing for blazingly fast performance.
[Yamada et al. (2021)](https://arxiv.org/abs/2106.00882) introduced a rescore step, which they called *rerank*, to boost the performance. They proposed that the `float32` query embedding could be compared with the binary document embeddings using the dot product. In practice, we first retrieve `rescore_multiplier * top_k` results with the binary query embedding and the binary document embeddings -- i.e., the list of the first `rescore_multiplier * top_k` results of the binary-versus-binary retrieval -- and then rescore that list of binary document embeddings with the `float32` query embedding.
By applying this novel rescoring step, we are able to preserve up to ~96% of the total retrieval performance, while reducing the memory and disk space usage by 32x and improving the retrieval speed by up to 32x as well.
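As a rough numpy sketch of the idea (not the library implementation): threshold at 0, pack the bits, rank with the Hamming distance, then rescore the candidates with the `float32` query:
```python
import numpy as np

rng = np.random.default_rng(0)
doc_emb = rng.normal(size=(1000, 1024)).astype(np.float32)  # stand-in for normalized document embeddings
query_emb = rng.normal(size=(1024,)).astype(np.float32)

# Binary quantization: 1 if the value is > 0, else 0, packed into bytes (1024 bits -> 128 bytes)
doc_binary = np.packbits(doc_emb > 0, axis=1)
query_binary = np.packbits(query_emb > 0)

# Hamming distance = number of differing bits = popcount of the XOR
hamming = np.unpackbits(doc_binary ^ query_binary, axis=1).sum(axis=1)
candidates = np.argsort(hamming)[:40]  # rescore_multiplier * top_k candidates

# Rescore: dot product between the float32 query and the binary (0/1) document bits
rescored = np.unpackbits(doc_binary[candidates], axis=1).astype(np.float32) @ query_emb
top_10 = candidates[np.argsort(-rescored)[:10]]
```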
### Binary Quantization in Sentence Transformers
Quantizing an embedding with a dimensionality of 1024 to binary would result in 1024 bits. In practice, it is much more common to store bits as bytes instead, so when we quantize to binary embeddings, we pack the bits into bytes using `np.packbits`.
As a result, in practice quantizing a `float32` embedding with a dimensionality of 1024 yields an `int8` or `uint8` embedding with a dimensionality of 128. See below for two approaches to producing quantized embeddings with Sentence Transformers:
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
# 1. Load an embedding model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 2a. Encode some text using "binary" quantization
binary_embeddings = model.encode(
    ["I am driving to the lake.", "It is a beautiful day."],
    precision="binary",
)
# 2b. or, encode some text without quantization & apply quantization afterwards
embeddings = model.encode(["I am driving to the lake.", "It is a beautiful day."])
binary_embeddings = quantize_embeddings(embeddings, precision="binary")
```
**References:**
* <a href="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1"><code>mixedbread-ai/mxbai-embed-large-v1</code></a>
* <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>
* <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a>
Here you can see the differences between default `float32` embeddings and binary embeddings in terms of shape, size, and `numpy` dtype:
```python
>>> embeddings.shape
(2, 1024)
>>> embeddings.nbytes
8192
>>> embeddings.dtype
float32
>>> binary_embeddings.shape
(2, 128)
>>> binary_embeddings.nbytes
256
>>> binary_embeddings.dtype
int8
```
Note that you can also choose `"ubinary"` to quantize to binary using the unsigned `uint8` data format. This may be a requirement for your vector library/database.
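For instance, re-using the `model` from the example above (a minimal sketch):
```python
ubinary_embeddings = model.encode(
    ["I am driving to the lake.", "It is a beautiful day."],
    precision="ubinary",
)
print(ubinary_embeddings.dtype)  # expected: uint8
```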
## Scalar (int8) Quantization
To convert the `float32` embeddings into `int8`, we use a process called scalar quantization. This involves mapping the continuous range of `float32` values to the discrete set of `int8` values, which can represent 256 distinct levels (from -128 to 127). This is done by using a large calibration dataset of embeddings. We compute the range of these embeddings, i.e. the `min` and `max` of each of the embedding dimensions. From there, we calculate the steps (buckets) in which we categorize each value.
To further boost the retrieval performance, you can optionally apply the same rescoring step as for the binary embeddings. It is important to note here that the calibration dataset has a large influence on the performance, since it defines the buckets.
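A simplified numpy sketch of what scalar quantization does conceptually, assuming per-dimension `min`/`max` calibration (the library implementation may differ in details):
```python
import numpy as np

rng = np.random.default_rng(0)
calibration = rng.normal(size=(10_000, 1024)).astype(np.float32)  # stand-in for calibration embeddings
embeddings = rng.normal(size=(2, 1024)).astype(np.float32)

# Per-dimension ranges from the calibration dataset define the buckets
mins = calibration.min(axis=0)
maxs = calibration.max(axis=0)
steps = (maxs - mins) / 255  # 256 int8 levels per dimension

# Map each float32 value into its bucket and shift it into the int8 range [-128, 127]
int8_embeddings = np.round((embeddings - mins) / steps - 128).clip(-128, 127).astype(np.int8)
```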
### Scalar Quantization in Sentence Transformers
Quantizing an embedding with a dimensionality of 1024 to `int8` results in 1024 bytes. In practice, we can choose either `uint8` or `int8`. This choice is usually made depending on what your vector library/database supports.
In practice, it is recommended to provide the scalar quantization with either:
1. a large set of embeddings to quantize all at once, or
2. `min` and `max` ranges for each of the embedding dimensions, or
3. a large calibration dataset of embeddings from which the `min` and `max` ranges can be computed.
If none of these are the case, you will be given a warning like this:
```
Computing int8 quantization buckets based on 2 embeddings. int8 quantization is more stable with 'ranges' calculated from more embeddings or a 'calibration_embeddings' that can be used to calculate the buckets.
```
See how you can produce scalar quantized embeddings using Sentence Transformers below:
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from datasets import load_dataset
# 1. Load an embedding model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 2. Prepare an example calibration dataset
corpus = load_dataset("nq_open", split="train[:1000]")["question"]
calibration_embeddings = model.encode(corpus)
# 3. Encode some text without quantization & apply quantization afterwards
embeddings = model.encode(["I am driving to the lake.", "It is a beautiful day."])
int8_embeddings = quantize_embeddings(
    embeddings,
    precision="int8",
    calibration_embeddings=calibration_embeddings,
)
```
**References:**
* <a href="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1"><code>mixedbread-ai/mxbai-embed-large-v1</code></a>
* <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>
* <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a>
Here you can see the differences between default `float32` embeddings and `int8` scalar embeddings in terms of shape, size, and `numpy` dtype:
```python
>>> embeddings.shape
(2, 1024)
>>> embeddings.nbytes
8192
>>> embeddings.dtype
float32
>>> int8_embeddings.shape
(2, 1024)
>>> int8_embeddings.nbytes
2048
>>> int8_embeddings.dtype
int8
```
### Combining Binary and Scalar Quantization
It is possible to combine binary and scalar quantization to get the best of both worlds: the extreme speed from binary embeddings and the great performance preservation of scalar embeddings with rescoring. See the [demo](#demo) below for a real-life implementation of this approach involving 41 million texts from Wikipedia. The pipeline for that setup is as follows:
1. The query is embedded using the [`mixedbread-ai/mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) SentenceTransformer model.
2. The query is quantized to binary using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a> function from the `sentence-transformers` library.
3. A binary index (41M binary embeddings; 5.2GB of memory/disk space) is searched using the quantized query for the top 40 documents.
4. The top 40 documents are loaded on the fly from an int8 index on disk (41M int8 embeddings; 0 bytes of memory, 47.5GB of disk space).
5. The top 40 documents are rescored using the float32 query and the int8 embeddings to get the top 10 documents.
6. The top 10 documents are sorted by score and displayed.
Through this approach, we use 5.2GB of memory and 52GB of disk space for the indices. This is considerably less than normal retrieval, for which we would require 200GB of memory and 200GB of disk space. Especially as you scale up even further, this will result in notable reductions in both latency and costs.
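A compact sketch of this search pipeline, assuming the binary FAISS index and the int8 document matrix have already been built and saved (the file paths are hypothetical; the full scripts below show how to create and persist these artifacts):
```python
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Assumed to exist already: a binary FAISS index and the matching int8 embedding matrix
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("corpus_ubinary.index")  # hypothetical path
int8_embeddings = np.load("corpus_int8.npy")  # hypothetical path

query_embedding = model.encode("How do I become a good programmer?")
query_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")

# 1. Fast binary search for top_k * rescore_multiplier candidates (here 10 * 4)
_, candidate_ids = binary_index.search(query_ubinary, 40)
candidate_ids = candidate_ids[0]

# 2. Rescore the candidates with the float32 query and the int8 document embeddings
scores = query_embedding @ int8_embeddings[candidate_ids].astype(np.float32).T
top_10 = candidate_ids[np.argsort(-scores)[:10]]
```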
## Additional extensions
Note that embedding quantization can be combined with other approaches to improve retrieval efficiency, such as [Matryoshka Embeddings](../../training/matryoshka/README.md). Additionally, the [Retrieve & Re-Rank](../retrieve_rerank/README.md) approach also works very well with quantized embeddings, i.e. you can still use a Cross-Encoder to rerank.
## Demo
The following demo showcases the retrieval efficiency using `exact` search through combining binary search with scalar (`int8`) rescoring. The solution requires 5GB of memory for the binary index and 50GB of disk space for the binary and scalar indices, considerably less than the 200GB of memory and disk space which would be required for regular `float32` retrieval. Additionally, retrieval is much faster.
<iframe
src="https://sentence-transformers-quantized-retrieval.hf.space"
frameborder="0"
width="100%"
height="1000"
></iframe>
## Try it yourself
The following scripts can be used to experiment with embedding quantization for retrieval & beyond. There are three categories:
* **Recommended Retrieval**:
  * [semantic_search_recommended.py](semantic_search_recommended.py): This script combines binary search with scalar rescoring, much like the above demo, for cheap, efficient, and performant retrieval.
* **Usage**:
  * [semantic_search_faiss.py](semantic_search_faiss.py): This script showcases regular usage of binary or scalar quantization, retrieval, and rescoring using FAISS, by using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_faiss"><code>semantic_search_faiss</code></a> utility function.
  * [semantic_search_usearch.py](semantic_search_usearch.py): This script showcases regular usage of binary or scalar quantization, retrieval, and rescoring using USearch, by using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_usearch"><code>semantic_search_usearch</code></a> utility function.
* **Benchmarks**:
  * [semantic_search_faiss_benchmark.py](semantic_search_faiss_benchmark.py): This script includes a retrieval speed benchmark of `float32` retrieval, binary retrieval + rescoring, and scalar retrieval + rescoring, using FAISS. It uses the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_faiss"><code>semantic_search_faiss</code></a> utility function. Our benchmarks especially show speedups for `ubinary`.
  * [semantic_search_usearch_benchmark.py](semantic_search_usearch_benchmark.py): This script includes a retrieval speed benchmark of `float32` retrieval, binary retrieval + rescoring, and scalar retrieval + rescoring, using USearch. It uses the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_usearch"><code>semantic_search_usearch</code></a> utility function. Our experiments show large speedups on newer hardware, particularly for `int8`.
import time
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_faiss
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# 2. Come up with some queries
queries = [
"How do I become a good programmer?",
"How do I become a good data scientist?",
]
# 3. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 4. Choose a target precision for the corpus embeddings
corpus_precision = "ubinary"
# Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
# But FAISS only supports "float32", "uint8", and "ubinary"
# 5. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
# NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
# but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
# the query embeddings. This is important only if you are using uint8 or int8 precision
# Initially, we don't have a FAISS index yet, we can use semantic_search_faiss to create it
corpus_index = None
while True:
    # 7. Encode the queries using the full precision
    start_time = time.time()
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    print(f"Encoding time: {time.time() - start_time:.6f} seconds")

    # 8. Perform semantic search using FAISS
    results, search_time, corpus_index = semantic_search_faiss(
        query_embeddings,
        corpus_index=corpus_index,
        corpus_embeddings=corpus_embeddings if corpus_index is None else None,
        corpus_precision=corpus_precision,
        top_k=10,
        calibration_embeddings=full_corpus_embeddings,
        rescore=corpus_precision != "float32",
        rescore_multiplier=4,
        exact=True,
        output_index=True,
    )
    # This is a helper function to showcase how FAISS can be used with quantized embeddings.
    # You must either provide the `corpus_embeddings` or the `corpus_index` FAISS index, but not both.
    # In the first call we'll provide the `corpus_embeddings` and get the `corpus_index` back, which
    # we'll use in the next call. In practice, the index is stored in RAM or saved to disk, and not
    # recalculated for every query.
    # This function will 1) quantize the query embeddings to the same precision as the corpus embeddings,
    # 2) perform the semantic search using FAISS, 3) rescore the results using the full precision embeddings,
    # and 4) return the results and the search time (and perhaps the FAISS index).
    # `corpus_precision` must be the same as the precision used to quantize the corpus embeddings.
    # It is used to convert the query embeddings to the same precision as the corpus embeddings.
    # `top_k` determines how many results are returned for each query.
    # `rescore_multiplier` is a parameter for the rescoring step. Rather than searching for the top_k results,
    # we search for top_k * rescore_multiplier results and rescore the top_k results using the full precision embeddings.
    # So, higher values of rescore_multiplier will give better results, but will be slower.
    # `calibration_embeddings` is a set of embeddings used to calibrate the quantization of the query embeddings.
    # This is important only if you are using uint8 or int8 precision. In practice, this is used to calculate
    # the minimum and maximum values of each of the embedding dimensions, which are then used to determine the
    # quantization thresholds.
    # `rescore` determines whether to rescore the results using the full precision embeddings, if False & the
    # corpus is quantized, the results will be very poor. `exact` determines whether to use the exact search
    # or the approximate search method in FAISS.

    # 9. Output the results
    print("Precision:", corpus_precision)
    print(f"Search time: {search_time:.6f} seconds")
    for query, result in zip(queries, results):
        print(f"Query: {query}")
        for entry in result:
            print(f"(Score: {entry['score']:.4f}) {corpus[entry['corpus_id']]}")
        print("")

    # 10. Prompt for more queries
    queries = [input("Please enter a question: ")]
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_faiss
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
num_queries = 1_000
queries = corpus[:num_queries]
# 2. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 3. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# 4. Encode the queries using the full precision
query_embeddings = model.encode(queries, normalize_embeddings=True)
for exact in (True, False):
    for corpus_precision in ("float32", "uint8", "ubinary"):
        corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
        # NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
        # but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
        # the query embeddings. This is important only if you are using uint8 or int8 precision

        # 5. Perform semantic search using FAISS
        rescore_multiplier = 4
        results, search_time = semantic_search_faiss(
            query_embeddings,
            corpus_embeddings=corpus_embeddings,
            corpus_precision=corpus_precision,
            top_k=10,
            calibration_embeddings=full_corpus_embeddings,
            rescore=corpus_precision != "float32",
            rescore_multiplier=rescore_multiplier,
            exact=exact,
        )

        print(
            f"{'Exact' if exact else 'Approximate'} search time using {corpus_precision} corpus: {search_time:.6f} seconds"
            + (f" (rescore_multiplier: {rescore_multiplier})" if corpus_precision != "float32" else "")
        )
"""
This script showcases a recommended approach to perform semantic search using quantized embeddings with FAISS and usearch.
In particular, it uses binary search with int8 rescoring. The binary search is highly efficient, and its index can be kept
in memory even for massive datasets: it takes (num_dimensions * num_documents / 8) bytes, i.e. 1.19GB for 10 million embeddings.
"""
import json
import os
import time
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from datasets import load_dataset
import faiss
from usearch.index import Index
# We use usearch as it can efficiently load int8 vectors from disk.
# Load the model
# NOTE: Because we are only comparing questions here, we will use the "query" prompt for everything.
# Normally you don't use this prompt for documents, but only for the queries
model = SentenceTransformer(
"mixedbread-ai/mxbai-embed-large-v1",
prompts={"query": "Represent this sentence for searching relevant passages: "},
default_prompt_name="query",
)
# Load a corpus with texts
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# Apply some default query
query = "How do I become a good programmer?"
# Try to load the precomputed binary and int8 indices
if os.path.exists("quora_faiss_ubinary.index"):
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("quora_faiss_ubinary.index")
int8_view = Index.restore("quora_usearch_int8.index", view=True)
else:
# Encode the corpus using the full precision
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# Convert the embeddings to "ubinary" for efficient FAISS search
ubinary_embeddings = quantize_embeddings(full_corpus_embeddings, "ubinary")
binary_index = faiss.IndexBinaryFlat(1024)
binary_index.add(ubinary_embeddings)
faiss.write_index_binary(binary_index, "quora_faiss_ubinary.index")
# Convert the embeddings to "int8" for efficiently loading int8 indices with usearch
int8_embeddings = quantize_embeddings(full_corpus_embeddings, "int8")
index = Index(ndim=1024, metric="ip", dtype="i8")
index.add(np.arange(len(int8_embeddings)), int8_embeddings)
index.save("quora_usearch_int8.index")
del index
# Load the int8 index as a view, which does not cost any memory
int8_view = Index.restore("quora_usearch_int8.index", view=True)
def search(query, top_k: int = 10, rescore_multiplier: int = 4):
    # 1. Embed the query as float32
    start_time = time.time()
    query_embedding = model.encode(query)
    embed_time = time.time() - start_time

    # 2. Quantize the query to ubinary
    start_time = time.time()
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")
    quantize_time = time.time() - start_time

    # 3. Search the binary index
    start_time = time.time()
    _scores, binary_ids = binary_index.search(query_embedding_ubinary, top_k * rescore_multiplier)
    binary_ids = binary_ids[0]
    search_time = time.time() - start_time

    # 4. Load the corresponding int8 embeddings
    start_time = time.time()
    int8_embeddings = int8_view[binary_ids].astype(int)
    load_time = time.time() - start_time

    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings
    start_time = time.time()
    scores = query_embedding @ int8_embeddings.T
    rescore_time = time.time() - start_time

    # 6. Sort the scores and return the top_k
    start_time = time.time()
    indices = (-scores).argsort()[:top_k]
    top_k_indices = binary_ids[indices]
    top_k_scores = scores[indices]
    sort_time = time.time() - start_time

    return (
        top_k_scores.tolist(),
        top_k_indices.tolist(),
        {
            "Embed Time": f"{embed_time:.4f} s",
            "Quantize Time": f"{quantize_time:.4f} s",
            "Search Time": f"{search_time:.4f} s",
            "Load Time": f"{load_time:.4f} s",
            "Rescore Time": f"{rescore_time:.4f} s",
            "Sort Time": f"{sort_time:.4f} s",
            "Total Retrieval Time": f"{quantize_time + search_time + load_time + rescore_time + sort_time:.4f} s",
        },
    )
while True:
    scores, indices, timings = search(query)

    # Output the results
    print(f"Timings:\n{json.dumps(timings, indent=2)}")
    print(f"Query: {query}")
    for score, index in zip(scores, indices):
        print(f"(Score: {score:.4f}) {corpus[index]}")
    print("")

    # Prompt for more queries
    query = input("Please enter a question: ")
import time
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# 2. Come up with some queries
queries = [
"How do I become a good programmer?",
"How do I become a good data scientist?",
]
# 3. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 4. Choose a target precision for the corpus embeddings
corpus_precision = "binary"
# Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
# But usearch only supports "float32", "int8", and "binary"
# 5. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
# NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
# but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
# the query embeddings. This is important only if you are using uint8 or int8 precision
# Initially, we don't have a usearch index yet, we can use semantic_search_usearch to create it
corpus_index = None
while True:
    # 7. Encode the queries using the full precision
    start_time = time.time()
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    print(f"Encoding time: {time.time() - start_time:.6f} seconds")

    # 8. Perform semantic search using usearch
    results, search_time, corpus_index = semantic_search_usearch(
        query_embeddings,
        corpus_index=corpus_index,
        corpus_embeddings=corpus_embeddings if corpus_index is None else None,
        corpus_precision=corpus_precision,
        top_k=10,
        calibration_embeddings=full_corpus_embeddings,
        rescore=corpus_precision != "float32",
        rescore_multiplier=4,
        exact=True,
        output_index=True,
    )
    # This is a helper function to showcase how usearch can be used with quantized embeddings.
    # You must either provide the `corpus_embeddings` or the `corpus_index` usearch index, but not both.
    # In the first call we'll provide the `corpus_embeddings` and get the `corpus_index` back, which
    # we'll use in the next call. In practice, the index is stored in RAM or saved to disk, and not
    # recalculated for every query.
    # This function will 1) quantize the query embeddings to the same precision as the corpus embeddings,
    # 2) perform the semantic search using usearch, 3) rescore the results using the full precision embeddings,
    # and 4) return the results and the search time (and perhaps the usearch index).
    # `corpus_precision` must be the same as the precision used to quantize the corpus embeddings.
    # It is used to convert the query embeddings to the same precision as the corpus embeddings.
    # `top_k` determines how many results are returned for each query.
    # `rescore_multiplier` is a parameter for the rescoring step. Rather than searching for the top_k results,
    # we search for top_k * rescore_multiplier results and rescore the top_k results using the full precision embeddings.
    # So, higher values of rescore_multiplier will give better results, but will be slower.
    # `calibration_embeddings` is a set of embeddings used to calibrate the quantization of the query embeddings.
    # This is important only if you are using uint8 or int8 precision. In practice, this is used to calculate
    # the minimum and maximum values of each of the embedding dimensions, which are then used to determine the
    # quantization thresholds.
    # `rescore` determines whether to rescore the results using the full precision embeddings, if False & the
    # corpus is quantized, the results will be very poor. `exact` determines whether to use the exact search
    # or the approximate search method in usearch.

    # 9. Output the results
    print(f"Search time: {search_time:.6f} seconds")
    for query, result in zip(queries, results):
        print(f"Query: {query}")
        for entry in result:
            print(f"(Score: {entry['score']:.4f}) {corpus[entry['corpus_id']]}")
        print("")

    # 10. Prompt for more queries
    queries = [input("Please enter a question: ")]
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
num_queries = 1_000
queries = corpus[:num_queries]
# 2. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 3. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# 4. Encode the queries using the full precision
query_embeddings = model.encode(queries, normalize_embeddings=True)
for exact in (True, False):
    for corpus_precision in ("float32", "int8", "binary"):
        corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
        # NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
        # but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
        # the query embeddings. This is important only if you are using uint8 or int8 precision

        # 5. Perform semantic search using usearch
        rescore_multiplier = 4
        results, search_time = semantic_search_usearch(
            query_embeddings,
            corpus_embeddings=corpus_embeddings,
            corpus_precision=corpus_precision,
            top_k=10,
            calibration_embeddings=full_corpus_embeddings,
            rescore=corpus_precision != "float32",
            rescore_multiplier=rescore_multiplier,
            exact=exact,
        )

        print(
            f"{'Exact' if exact else 'Approximate'} search time using {corpus_precision} corpus: {search_time:.6f} seconds"
            + (f" (rescore_multiplier: {rescore_multiplier})" if corpus_precision != "float32" else "")
        )