Commit 0fccd232 authored by Rayyyyy: First add
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Corpus with example sentences
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"A man is eating pasta.",
"The girl is carrying a baby.",
"The baby is carried by the woman",
"A man is riding a horse.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"Someone in a gorilla costume is playing a set of drums.",
"A cheetah is running behind its prey.",
"A cheetah chases prey on across a field.",
]
corpus_embeddings = embedder.encode(corpus)
# Some models don't automatically normalize the embeddings. In that case, normalize them yourself
# (requires `import numpy as np`):
# corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform agglomerative clustering
clustering_model = AgglomerativeClustering(
    n_clusters=None, distance_threshold=1.5
)  # , affinity='cosine', linkage='average', distance_threshold=0.4)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i + 1)
    print(cluster)
    print("")
"""
This is a more complex example of performing clustering on a large-scale dataset.
This example finds local communities in a large set of sentences, i.e., groups of sentences that are highly
similar. You can freely configure the threshold for what is considered similar. A high threshold will
only find extremely similar sentences; a lower threshold will find more sentences that are less similar.
A second parameter is 'min_community_size': Only communities with at least a certain number of sentences will be returned.
The method for finding the communities is extremely fast: clustering 50k sentences requires only about 5 seconds (plus embedding computation).
In this example, we download a large set of questions from Quora and then find similar questions in this set.
"""
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time
# Model for computing sentence embeddings. We use one trained for similar questions detection
model = SentenceTransformer("all-MiniLM-L6-v2")
# We download the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
# and find similar questions in it
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 50000 # We limit our corpus to only the first 50k questions
# Check if the dataset exists. If not, download it
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)
# Get all unique sentences from the file
corpus_sentences = set()
with open(dataset_path, encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.add(row["question1"])
        corpus_sentences.add(row["question2"])
        if len(corpus_sentences) >= max_corpus_size:
            break
corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)
print("Start clustering")
start_time = time.time()
# Two parameters to tune:
# min_community_size: Only consider communities that have at least 25 sentences
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)
print("Clustering done after {:.2f} sec".format(time.time() - start_time))

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i + 1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])
"""
This is a simple application for sentence embeddings: clustering
Sentences are mapped to sentence embeddings and then k-means clustering is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Corpus with example sentences
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"A man is eating pasta.",
"The girl is carrying a baby.",
"The baby is carried by the woman",
"A man is riding a horse.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"Someone in a gorilla costume is playing a set of drums.",
"A cheetah is running behind its prey.",
"A cheetah chases prey on across a field.",
]
corpus_embeddings = embedder.encode(corpus)
# Perform k-means clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i + 1)
    print(cluster)
    print("")
# Computing Sentence Embeddings
The basic function to compute sentence embeddings looks like this:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
# The sentences we would like to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of strings.",
    "The quick brown fox jumps over the lazy dog.",
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
```
**Note:** Even though we talk about sentence embeddings, you can also use these models for shorter phrases as well as for longer texts with multiple sentences. See the section on Input Sequence Length below for more notes on embeddings for paragraphs.
First, we load a sentence-transformer model:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("model_name_or_path")
```
You can either specify a [pre-trained model](https://www.sbert.net/docs/pretrained_models.html) or you can pass a path on your disc to load the sentence-transformer model from that folder.
If available, the model is automatically executed on the GPU. You can specify the device for the model like this:
```python
model = SentenceTransformer("model_name_or_path", device="cuda")
```
Here, *device* can be any PyTorch device (like `cpu`, `cuda`, `cuda:0`, etc.).
The relevant method to encode a set of sentences / texts is `model.encode()`. In the following, you can find the parameters this method accepts. Some relevant parameters are *batch_size* (depending on your GPU, a different batch size is optimal) as well as *convert_to_numpy* (returns a numpy matrix) and *convert_to_tensor* (returns a pytorch tensor).
```eval_rst
.. autoclass:: sentence_transformers.SentenceTransformer
:members: encode
```
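For example, a minimal call that sets a few of these parameters could look like this (the sentences and the batch size are purely illustrative):
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Larger batches are usually faster on a GPU; convert_to_tensor returns one pytorch tensor
embeddings = model.encode(
    ["This is an example sentence", "Each sentence is converted"],
    batch_size=32,
    convert_to_tensor=True,
    show_progress_bar=False,
)
print(embeddings.shape)  # torch.Size([2, 384]) for this model
```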
## Prompt Templates
Some models require using specific text *prompts* to achieve optimal performance. For example, with [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) you should prefix all queries with `query: ` and all passages with `passage: `. Another example is [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5), which performs best for retrieval when the input texts are prefixed with `Represent this sentence for searching relevant passages: `.
Sentence Transformer models can be initialized with `prompts` and `default_prompt_name` parameters:
* `prompts` is an optional argument that accepts a dictionary mapping prompt names to prompt texts. The prompt will be prepended to the input text during inference. For example,
```python
model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={
        "classification": "Classify the following text: ",
        "retrieval": "Retrieve semantically similar text: ",
        "clustering": "Identify the topic or theme based on the text: ",
    },
)
# or
model.prompts = {
    "classification": "Classify the following text: ",
    "retrieval": "Retrieve semantically similar text: ",
    "clustering": "Identify the topic or theme based on the text: ",
}
```
* `default_prompt_name` is an optional argument that determines the default prompt to be used. It has to correspond with a prompt name from `prompts`. If `None`, then no prompt is used by default. For example,
```python
model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={
        "classification": "Classify the following text: ",
        "retrieval": "Retrieve semantically similar text: ",
        "clustering": "Identify the topic or theme based on the text: ",
    },
    default_prompt_name="retrieval",
)
# or
model.default_prompt_name = "retrieval"
```
Both of these parameters can also be specified in the `config_sentence_transformers.json` file of a saved model. That way, you won't have to specify these options manually when loading. When you save a Sentence Transformer model, these options will be automatically saved as well.
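As a small sketch of this behavior (the local save path is hypothetical), saving and re-loading a model keeps the configured prompts without specifying them again:
```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    prompts={"retrieval": "Retrieve semantically similar text: "},
    default_prompt_name="retrieval",
)
# Saving stores the prompts in config_sentence_transformers.json ...
model.save("local-e5-with-prompts")  # hypothetical local path
# ... so the re-loaded model applies the same default prompt
model = SentenceTransformer("local-e5-with-prompts")
```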
During inference, prompts can be applied in a few different ways. All of these scenarios result in identical texts being embedded:
1. Explicitly using the `prompt` option in `SentenceTransformer.encode`:
```python
embeddings = model.encode("How to bake a strawberry cake", prompt="Retrieve semantically similar text: ")
```
2. Explicitly using the `prompt_name` option in `SentenceTransformer.encode` by relying on the prompts loaded from a) initialization or b) the model config.
```python
embeddings = model.encode("How to bake a strawberry cake", prompt_name="retrieval")
```
3. If neither `prompt` nor `prompt_name` is specified in `SentenceTransformer.encode`, then the prompt specified by `default_prompt_name` will be applied. If it is `None`, then no prompt will be applied.
```python
embeddings = model.encode("How to bake a strawberry cake")
```
## Input Sequence Length
For transformer models like BERT / RoBERTa / DistilBERT etc., the runtime and memory requirements grow quadratically with the input length. This limits transformers to inputs of a certain length. A common value for BERT & Co. is 512 word pieces, which corresponds to about 300-400 words (for English). Texts longer than this are truncated to the first x word pieces.
By default, the provided methods use a limit of 128 word pieces; longer inputs will be truncated. You can get and set the maximal sequence length like this:
```python
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Max Sequence Length:", model.max_seq_length)
# Change the length to 200
model.max_seq_length = 200
print("Max Sequence Length:", model.max_seq_length)
```
**Note:** You cannot increase the length beyond what is maximally supported by the respective transformer model. Also note that if a model was trained on short texts, the representations for long texts might not be that good.
## Storing & Loading Embeddings
The easiest method is to use *pickle* to store pre-computed embeddings on disc and to load them from disc later. This can be especially useful if you need to encode a large set of sentences.
```python
from sentence_transformers import SentenceTransformer
import pickle
model = SentenceTransformer("all-MiniLM-L6-v2")
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]
embeddings = model.encode(sentences)

# Store sentences & embeddings on disc
with open("embeddings.pkl", "wb") as fOut:
    pickle.dump({"sentences": sentences, "embeddings": embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

# Load sentences & embeddings from disc
with open("embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]
```
## Multi-Process / Multi-GPU Encoding
You can encode input texts with more than one GPU (or with multiple processes on a CPU machine). For an example, see: [computing_embeddings_multi_gpu.py](computing_embeddings_multi_gpu.py).
The relevant method is `start_multi_process_pool()`, which starts multiple processes that are used for encoding.
```eval_rst
.. automethod:: sentence_transformers.SentenceTransformer.start_multi_process_pool
```
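A minimal sketch of the multi-process workflow (mirroring the full example script included further below) looks like this:
```python
from sentence_transformers import SentenceTransformer

if __name__ == "__main__":
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences = ["This is sentence {}".format(i) for i in range(10000)]

    # One process per CUDA device (or several CPU processes if no GPU is available)
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(sentences, pool)
    model.stop_multi_process_pool(pool)
    print("Embeddings computed. Shape:", embeddings.shape)
```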
## Sentence Embeddings with Transformers
Most of our pre-trained models are based on [Huggingface.co/Transformers](https://huggingface.co/transformers/) and are also hosted in the [models repository](https://huggingface.co/models) from Huggingface. It is possible to use our sentence embedding models without installing sentence-transformers:
```python
from transformers import AutoTokenizer, AutoModel
import torch
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Sentences we want sentence embeddings for
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Tokenize sentences
encoded_input = tokenizer(
    sentences, padding=True, truncation=True, max_length=128, return_tensors="pt"
)

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
```
You can find the available models here: [https://huggingface.co/sentence-transformers](https://huggingface.co/sentence-transformers)
In the above example, we add mean pooling on top of the AutoModel (which will load a BERT model). We also have models that use max-pooling or the CLS token. To see how to apply these pooling strategies correctly, have a look at [sentence-transformers/bert-base-nli-max-tokens](https://huggingface.co/sentence-transformers/bert-base-nli-max-tokens) and [sentence-transformers/bert-base-nli-cls-token](https://huggingface.co/sentence-transformers/bert-base-nli-cls-token).
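As a rough sketch, CLS-token pooling would replace the mean pooling above by simply taking the embedding of the first token; whether this matches how a particular model was trained should be checked on its model card:
```python
def cls_pooling(model_output):
    # The [CLS] token is the first token of each sequence
    return model_output[0][:, 0]


sentence_embeddings = cls_pooling(model_output)
```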
"""
This basic example loads a pre-trained model from the web and uses it to
generate sentence embeddings for a given list of sentences.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging
#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout
# Load pre-trained Sentence Transformer Model. It will be downloaded automatically
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed a list of sentences
sentences = [
"This framework generates embeddings for each input sentence",
"Sentences are passed as a list of string.",
"The quick brown fox jumps over the lazy dog.",
]
sentence_embeddings = model.encode(sentences)
# The result is a list of sentence embeddings as numpy arrays
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")
"""
This example starts multiple processes (1 per GPU), which encode
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
# Important: you need to guard your code with `if __name__ == "__main__"`. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
    # Create a large list of 100k sentences
    sentences = ["This is sentence {}".format(i) for i in range(100000)]

    # Define the model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    # Compute the embeddings using the multi-process pool
    emb = model.encode_multi_process(sentences, pool)
    print("Embeddings computed. Shape:", emb.shape)

    # Optional: Stop the processes in the pool
    model.stop_multi_process_pool(pool)
"""
This example starts multiple processes (1 per GPU), which encode
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
It also demonstrates how to stream data which is helpful in case you don't
want to wait for an extremely large dataset to download, or if you want to
limit the amount of memory used. More info about dataset streaming:
https://huggingface.co/docs/datasets/stream
"""
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
logging.basicConfig(
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
# Important: you need to guard your code with `if __name__ == "__main__"`. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
    # Set params
    data_stream_size = 16384  # Size of the data that is loaded into memory at once
    chunk_size = 1024  # Size of the chunks that are sent to each process
    encode_batch_size = 128  # Batch size of the model

    # Load a large dataset in streaming mode. More info: https://huggingface.co/docs/datasets/stream
    dataset = load_dataset("yahoo_answers_topics", split="train", streaming=True)
    dataloader = DataLoader(dataset.with_format("torch"), batch_size=data_stream_size)

    # Define the model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    for i, batch in enumerate(tqdm(dataloader)):
        # Compute the embeddings using the multi-process pool
        sentences = batch["best_answer"]
        batch_emb = model.encode_multi_process(sentences, pool, chunk_size=chunk_size, batch_size=encode_batch_size)
        print("Embeddings computed for 1 batch. Shape:", batch_emb.shape)

    # Optional: Stop the processes in the pool
    model.stop_multi_process_pool(pool)
# Cross-Encoders
SentenceTransformers also supports loading Cross-Encoders for sentence pair scoring and sentence pair classification tasks.
## Bi-Encoder vs. Cross-Encoder
First, it is important to understand the difference between Bi- and Cross-Encoders.
**Bi-Encoders** produce a sentence embedding for a given sentence. We pass sentences A and B independently to BERT, which results in the sentence embeddings u and v. These sentence embeddings can then be compared using cosine similarity:
![BiEncoder](https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/Bi_vs_Cross-Encoder.png)
In contrast, for a **Cross-Encoder** we pass both sentences simultaneously to the Transformer network. It then produces an output value between 0 and 1 indicating the similarity of the input sentence pair.
A **Cross-Encoder does not produce a sentence embedding**. Also, we are not able to pass individual sentences to a Cross-Encoder.
As detailed in our [paper](https://arxiv.org/abs/1908.10084), Cross-Encoders achieve better performance than Bi-Encoders. However, for many applications they are not practical, as they do not produce embeddings that we could e.g. index or efficiently compare using cosine similarity.
## When to use Cross- / Bi-Encoders?
Cross-Encoders can be used whenever you have a pre-defined set of sentence pairs you want to score. For example, you have 100 sentence pairs and you want to get similarity scores for these 100 pairs.
Bi-Encoders (see [Computing Sentence Embeddings](../computing-embeddings/README.md)) are used whenever you need a sentence embedding in a vector space for efficient comparison. Applications are, for example, Information Retrieval / Semantic Search or Clustering. Cross-Encoders would be the wrong choice for these applications: clustering 10,000 sentences with Cross-Encoders would require computing similarity scores for about 50 million sentence pairs (10,000 · 9,999 / 2 ≈ 50 million), which takes about 65 hours. With a Bi-Encoder, you compute the embedding for each sentence, which takes only about 5 seconds. You can then perform the clustering.
## Cross-Encoders Usage
Using Cross-Encoders is quite easy:
```python
from sentence_transformers.cross_encoder import CrossEncoder
model = CrossEncoder("model_name_or_path")
scores = model.predict([["My first", "sentence pair"], ["Second text", "pair"]])
```
You pass to `model.predict` a list of sentence **pairs**. Note that Cross-Encoders do not work on individual sentences; you have to pass sentence pairs.
As the model name, you can pass any model name or path that is compatible with the Hugging Face [AutoModel](https://huggingface.co/transformers/model_doc/auto.html) class.
For a full example that scores a query against all possible sentences in a corpus, see [cross-encoder_usage.py](cross-encoder_usage.py).
## Combining Bi- and Cross-Encoders
Cross-Encoders achieve higher performance than Bi-Encoders; however, they do not scale well to large datasets. Here, it can make sense to combine Cross- and Bi-Encoders, for example in Information Retrieval / Semantic Search scenarios: first, you use an efficient Bi-Encoder to retrieve e.g. the top-100 most similar sentences for a query. Then, you use a Cross-Encoder to re-rank these 100 hits by computing the score for every (query, hit) combination.
For more details on combining Bi- and Cross-Encoders, see [Application - Information Retrieval](../retrieve_rerank/README.md).
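A condensed sketch of this retrieve-and-re-rank pattern (the corpus and the model choices are only illustrative; a complete script is included further below):
```python
from sentence_transformers import SentenceTransformer, CrossEncoder, util

bi_encoder = SentenceTransformer("all-MiniLM-L6-v2")
cross_encoder = CrossEncoder("cross-encoder/stsb-roberta-base")

corpus = ["A man is eating food.", "A monkey is playing drums.", "A woman is playing violin."]
corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True)

query = "Someone is making music."

# 1. Retrieve candidates with the efficient Bi-Encoder
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]

# 2. Re-rank the candidates with the more accurate Cross-Encoder
pairs = [[query, corpus[hit["corpus_id"]]] for hit in hits]
scores = cross_encoder.predict(pairs)
for hit, score in sorted(zip(hits, scores), key=lambda x: x[1], reverse=True):
    print(f"{score:.3f}\t{corpus[hit['corpus_id']]}")
```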
## Training Cross-Encoders
See [Cross-Encoder Training](../../training/cross-encoder/README.md) to learn how to train your own Cross-Encoder models.
"""
This script contains an example of how to perform re-ranking with a Cross-Encoder for semantic search.
First, we use an efficient Bi-Encoder to retrieve similar questions from the Quora Duplicate Questions dataset:
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs
Then, we re-rank the hits from the Bi-Encoder using a Cross-Encoder.
"""
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import CrossEncoder
import os
import csv
import pickle
import time
# We use a Bi-Encoder (SentenceTransformer) that produces embeddings for questions.
# We then search for similar questions using cosine similarity and identify the `num_candidates` most similar questions
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
num_candidates = 500
# To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question)
# and outputs a score 0...1 indicating the similarity.
cross_encoder_model = CrossEncoder("cross-encoder/stsb-roberta-base")
# Dataset we want to use
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 20000
# Some local file to cache computed embeddings
embedding_cache_path = "quora-embeddings-{}-size-{}.pkl".format(model_name.replace("/", "_"), max_corpus_size)
# Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download it
    if not os.path.exists(dataset_path):
        print("Download dataset")
        util.http_get(url, dataset_path)

    # Get all unique sentences from the file
    corpus_sentences = set()
    with open(dataset_path, encoding="utf8") as fIn:
        reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row["question1"])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row["question2"])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({"sentences": corpus_sentences, "embeddings": corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data["sentences"][0:max_corpus_size]
        corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]
###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
while True:
    inp_question = input("Please enter a question: ")
    print("Input question:", inp_question)

    # First, retrieve candidates using cosine similarity search
    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates)
    hits = hits[0]  # Get the hits for the first query

    print("Cosine-Similarity search took {:.3f} seconds".format(time.time() - start_time))
    print("Top 5 hits with cosine-similarity:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

    # Now, do the re-ranking with the cross-encoder
    start_time = time.time()
    sentence_pairs = [[inp_question, corpus_sentences[hit["corpus_id"]]] for hit in hits]
    ce_scores = cross_encoder_model.predict(sentence_pairs)

    for idx in range(len(hits)):
        hits[idx]["cross-encoder_score"] = ce_scores[idx]

    # Sort list by CrossEncoder scores
    hits = sorted(hits, key=lambda x: x["cross-encoder_score"], reverse=True)
    print("\nRe-ranking with CrossEncoder took {:.3f} seconds".format(time.time() - start_time))
    print("Top 5 hits with CrossEncoder:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit["cross-encoder_score"], corpus_sentences[hit["corpus_id"]]))

    print("\n\n========\n")
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
# Pre-trained cross encoder
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
# We want to compute the similarity between the query sentence
query = "A man is eating pasta."
# With all sentences in the corpus
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"The girl is carrying a baby.",
"A man is riding a horse.",
"A woman is playing violin.",
"Two men pushed carts through the woods.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"A cheetah is running behind its prey.",
]
# 1. We rank all sentences in the corpus for the query
ranks = model.rank(query, corpus)
# Print the scores
print("Query:", query)
for rank in ranks:
    print(f"{rank['score']:.2f}\t{corpus[rank['corpus_id']]}")
# 2. Alternatively, you can also manually compute the score between two sentences
sentence_combinations = [[query, sentence] for sentence in corpus]
scores = model.predict(sentence_combinations)
# Sort the scores in decreasing order to get the corpus indices
ranked_indices = np.argsort(scores)[::-1]
print("scores:", scores)
print("indices:", ranked_indices)
# Embedding Quantization
Embeddings may be challenging to scale up, which leads to expensive solutions and high latencies. Currently, many state-of-the-art models produce embeddings with 1024 dimensions, each of which is encoded in `float32`, i.e., they require 4 bytes per dimension. To perform retrieval over 50 million vectors, you would therefore need around 200GB of memory. This tends to require complex and costly solutions at scale.
However, there is a new approach to counter this problem; it entails reducing the size of each of the individual values in the embedding: **Quantization**. Experiments on quantization have shown that we can maintain a large amount of performance while significantly speeding up computation and saving on memory, storage, and costs.
To learn more about Embedding Quantization and its performance, please read the [blogpost](https://huggingface.co/blog/embedding-quantization) by Sentence Transformers and mixedbread.ai.
## Binary Quantization
Binary quantization refers to the conversion of the `float32` values in an embedding to 1-bit values, resulting in a 32x reduction in memory and storage usage. To quantize `float32` embeddings to binary, we simply threshold normalized embeddings at 0: if the value is larger than 0, we make it 1, otherwise we convert it to 0. We can use the Hamming Distance to efficiently perform retrieval with these binary embeddings. This is simply the number of positions at which the bits of two binary embeddings differ. The lower the Hamming Distance, the closer the embeddings, and thus the more relevant the document. A huge advantage of the Hamming Distance is that it can be easily calculated with 2 CPU cycles, allowing for blazingly fast performance.
[Yamada et al. (2021)](https://arxiv.org/abs/2106.00882) introduced a rescore step, which they called *rerank*, to boost the performance. They proposed that the `float32` query embedding could be compared with the binary document embeddings using the dot product. In practice, we first retrieve `rescore_multiplier * top_k` results with the binary query embedding and the binary document embeddings -- i.e., the list of the first `rescore_multiplier * top_k` results of the binary-versus-binary retrieval -- and then rescore that list of binary document embeddings with the `float32` query embedding.
By applying this novel rescoring step, we are able to preserve up to ~96% of the total retrieval performance, while reducing the memory and disk space usage by 32x and improving the retrieval speed by up to 32x as well.
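As a rough numpy sketch of the idea (not the library implementation): threshold at 0, pack the bits, rank with the Hamming distance, then rescore the candidates with the `float32` query:
```python
import numpy as np

rng = np.random.default_rng(0)
doc_emb = rng.normal(size=(1000, 1024)).astype(np.float32)  # stand-in for normalized document embeddings
query_emb = rng.normal(size=(1024,)).astype(np.float32)

# Binary quantization: 1 if the value is > 0, else 0, packed into bytes (1024 bits -> 128 bytes)
doc_binary = np.packbits(doc_emb > 0, axis=1)
query_binary = np.packbits(query_emb > 0)

# Hamming distance = number of differing bits = popcount of the XOR
hamming = np.unpackbits(doc_binary ^ query_binary, axis=1).sum(axis=1)
candidates = np.argsort(hamming)[:40]  # rescore_multiplier * top_k candidates

# Rescore: dot product between the float32 query and the binary (0/1) document bits
rescored = np.unpackbits(doc_binary[candidates], axis=1).astype(np.float32) @ query_emb
top_10 = candidates[np.argsort(-rescored)[:10]]
```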
### Binary Quantization in Sentence Transformers
Quantizing an embedding with a dimensionality of 1024 to binary would result in 1024 bits. In practice, it is much more common to store bits as bytes instead, so when we quantize to binary embeddings, we pack the bits into bytes using `np.packbits`.
As a result, in practice quantizing a `float32` embedding with a dimensionality of 1024 yields an `int8` or `uint8` embedding with a dimensionality of 128. See below for two approaches to producing quantized embeddings with Sentence Transformers:
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
# 1. Load an embedding model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 2a. Encode some text using "binary" quantization
binary_embeddings = model.encode(
    ["I am driving to the lake.", "It is a beautiful day."],
    precision="binary",
)
# 2b. or, encode some text without quantization & apply quantization afterwards
embeddings = model.encode(["I am driving to the lake.", "It is a beautiful day."])
binary_embeddings = quantize_embeddings(embeddings, precision="binary")
```
**References:**
* <a href="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1"><code>mixedbread-ai/mxbai-embed-large-v1</code></a>
* <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>
* <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a>
Here you can see the differences between default `float32` embeddings and binary embeddings in terms of shape, size, and `numpy` dtype:
```python
>>> embeddings.shape
(2, 1024)
>>> embeddings.nbytes
8192
>>> embeddings.dtype
float32
>>> binary_embeddings.shape
(2, 128)
>>> binary_embeddings.nbytes
256
>>> binary_embeddings.dtype
int8
```
Note that you can also choose `"ubinary"` to quantize to binary using the unsigned `uint8` data format. This may be a requirement for your vector library/database.
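For instance, re-using the `model` from the example above (a minimal sketch):
```python
ubinary_embeddings = model.encode(
    ["I am driving to the lake.", "It is a beautiful day."],
    precision="ubinary",
)
print(ubinary_embeddings.dtype)  # expected: uint8
```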
## Scalar (int8) Quantization
To convert the `float32` embeddings into `int8`, we use a process called scalar quantization. This involves mapping the continuous range of `float32` values to the discrete set of `int8` values, which can represent 256 distinct levels (from -128 to 127). This is done by using a large calibration dataset of embeddings. We compute the range of these embeddings, i.e. the `min` and `max` of each of the embedding dimensions. From there, we calculate the steps (buckets) in which we categorize each value.
To further boost the retrieval performance, you can optionally apply the same rescoring step as for the binary embeddings. It is important to note here that the calibration dataset has a large influence on the performance, since it defines the buckets.
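A simplified numpy sketch of what scalar quantization does conceptually, assuming per-dimension `min`/`max` calibration (the library implementation may differ in details):
```python
import numpy as np

rng = np.random.default_rng(0)
calibration = rng.normal(size=(10_000, 1024)).astype(np.float32)  # stand-in for calibration embeddings
embeddings = rng.normal(size=(2, 1024)).astype(np.float32)

# Per-dimension ranges from the calibration dataset define the buckets
mins = calibration.min(axis=0)
maxs = calibration.max(axis=0)
steps = (maxs - mins) / 255  # 256 int8 levels per dimension

# Map each float32 value into its bucket and shift it into the int8 range [-128, 127]
int8_embeddings = np.round((embeddings - mins) / steps - 128).clip(-128, 127).astype(np.int8)
```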
### Scalar Quantization in Sentence Transformers
Quantizing an embedding with a dimensionality of 1024 to `int8` results in 1024 bytes. In practice, we can choose either `uint8` or `int8`. This choice is usually made depending on what your vector library/database supports.
In practice, it is recommended to provide the scalar quantization with either:
1. a large set of embeddings to quantize all at once, or
2. `min` and `max` ranges for each of the embedding dimensions, or
3. a large calibration dataset of embeddings from which the `min` and `max` ranges can be computed.
If none of these are the case, you will be given a warning like this:
```
Computing int8 quantization buckets based on 2 embeddings. int8 quantization is more stable with 'ranges' calculated from more embeddings or a 'calibration_embeddings' that can be used to calculate the buckets.
```
See how you can produce scalar quantized embeddings using Sentence Transformers below:
```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from datasets import load_dataset
# 1. Load an embedding model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 2. Prepare an example calibration dataset
corpus = load_dataset("nq_open", split="train[:1000]")["question"]
calibration_embeddings = model.encode(corpus)
# 3. Encode some text without quantization & apply quantization afterwards
embeddings = model.encode(["I am driving to the lake.", "It is a beautiful day."])
int8_embeddings = quantize_embeddings(
    embeddings,
    precision="int8",
    calibration_embeddings=calibration_embeddings,
)
```
**References:**
* <a href="https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1"><code>mixedbread-ai/mxbai-embed-large-v1</code></a>
* <a href="../../../docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"><code>SentenceTransformer.encode</code></a>
* <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a>
Here you can see the differences between default `float32` embeddings and `int8` scalar embeddings in terms of shape, size, and `numpy` dtype:
```python
>>> embeddings.shape
(2, 1024)
>>> embeddings.nbytes
8192
>>> embeddings.dtype
float32
>>> int8_embeddings.shape
(2, 1024)
>>> int8_embeddings.nbytes
2048
>>> int8_embeddings.dtype
int8
```
### Combining Binary and Scalar Quantization
It is possible to combine binary and scalar quantization to get the best of both worlds: the extreme speed from binary embeddings and the great performance preservation of scalar embeddings with rescoring. See the [demo](#demo) below for a real-life implementation of this approach involving 41 million texts from Wikipedia. The pipeline for that setup is as follows:
1. The query is embedded using the [`mixedbread-ai/mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) SentenceTransformer model.
2. The query is quantized to binary using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.quantize_embeddings"><code>quantize_embeddings</code></a> function from the `sentence-transformers` library.
3. A binary index (41M binary embeddings; 5.2GB of memory/disk space) is searched using the quantized query for the top 40 documents.
4. The top 40 documents are loaded on the fly from an int8 index on disk (41M int8 embeddings; 0 bytes of memory, 47.5GB of disk space).
5. The top 40 documents are rescored using the float32 query and the int8 embeddings to get the top 10 documents.
6. The top 10 documents are sorted by score and displayed.
Through this approach, we use 5.2GB of memory and 52GB of disk space for the indices. This is considerably less than normal retrieval, for which we would require 200GB of memory and 200GB of disk space. Especially as you scale up even further, this will result in notable reductions in both latency and costs.
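A compact sketch of this search pipeline, assuming the binary FAISS index and the int8 document matrix have already been built and saved (the file paths are hypothetical; the full scripts below show how to create and persist these artifacts):
```python
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Assumed to exist already: a binary FAISS index and the matching int8 embedding matrix
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("corpus_ubinary.index")  # hypothetical path
int8_embeddings = np.load("corpus_int8.npy")  # hypothetical path

query_embedding = model.encode("How do I become a good programmer?")
query_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")

# 1. Fast binary search for top_k * rescore_multiplier candidates (here 10 * 4)
_, candidate_ids = binary_index.search(query_ubinary, 40)
candidate_ids = candidate_ids[0]

# 2. Rescore the candidates with the float32 query and the int8 document embeddings
scores = query_embedding @ int8_embeddings[candidate_ids].astype(np.float32).T
top_10 = candidate_ids[np.argsort(-scores)[:10]]
```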
## Additional extensions
Note that embedding quantization can be combined with other approaches to improve retrieval efficiency, such as [Matryoshka Embeddings](../../training/matryoshka/README.md). Additionally, the [Retrieve & Re-Rank](../retrieve_rerank/README.md) approach also works very well with quantized embeddings, i.e. you can still use a Cross-Encoder to rerank.
## Demo
The following demo showcases the retrieval efficiency using `exact` search through combining binary search with scalar (`int8`) rescoring. The solution requires 5GB of memory for the binary index and 50GB of disk space for the binary and scalar indices, considerably less than the 200GB of memory and disk space which would be required for regular `float32` retrieval. Additionally, retrieval is much faster.
<iframe
src="https://sentence-transformers-quantized-retrieval.hf.space"
frameborder="0"
width="100%"
height="1000"
></iframe>
## Try it yourself
The following scripts can be used to experiment with embedding quantization for retrieval & beyond. There are three categories:
* **Recommended Retrieval**:
  * [semantic_search_recommended.py](semantic_search_recommended.py): This script combines binary search with scalar rescoring, much like the above demo, for cheap, efficient, and performant retrieval.
* **Usage**:
  * [semantic_search_faiss.py](semantic_search_faiss.py): This script showcases regular usage of binary or scalar quantization, retrieval, and rescoring using FAISS, by using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_faiss"><code>semantic_search_faiss</code></a> utility function.
  * [semantic_search_usearch.py](semantic_search_usearch.py): This script showcases regular usage of binary or scalar quantization, retrieval, and rescoring using USearch, by using the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_usearch"><code>semantic_search_usearch</code></a> utility function.
* **Benchmarks**:
  * [semantic_search_faiss_benchmark.py](semantic_search_faiss_benchmark.py): This script includes a retrieval speed benchmark of `float32` retrieval, binary retrieval + rescoring, and scalar retrieval + rescoring, using FAISS. It uses the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_faiss"><code>semantic_search_faiss</code></a> utility function. Our benchmarks especially show speedups for `ubinary`.
  * [semantic_search_usearch_benchmark.py](semantic_search_usearch_benchmark.py): This script includes a retrieval speed benchmark of `float32` retrieval, binary retrieval + rescoring, and scalar retrieval + rescoring, using USearch. It uses the <a href="../../../docs/package_reference/quantization.html#sentence_transformers.quantization.semantic_search_usearch"><code>semantic_search_usearch</code></a> utility function. Our experiments show large speedups on newer hardware, particularly for `int8`.
import time
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_faiss
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# 2. Come up with some queries
queries = [
"How do I become a good programmer?",
"How do I become a good data scientist?",
]
# 3. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 4. Choose a target precision for the corpus embeddings
corpus_precision = "ubinary"
# Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
# But FAISS only supports "float32", "uint8", and "ubinary"
# 5. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
# NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
# but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
# the query embeddings. This is important only if you are using uint8 or int8 precision
# Initially, we don't have a FAISS index yet, we can use semantic_search_faiss to create it
corpus_index = None
while True:
    # 7. Encode the queries using the full precision
    start_time = time.time()
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    print(f"Encoding time: {time.time() - start_time:.6f} seconds")

    # 8. Perform semantic search using FAISS
    results, search_time, corpus_index = semantic_search_faiss(
        query_embeddings,
        corpus_index=corpus_index,
        corpus_embeddings=corpus_embeddings if corpus_index is None else None,
        corpus_precision=corpus_precision,
        top_k=10,
        calibration_embeddings=full_corpus_embeddings,
        rescore=corpus_precision != "float32",
        rescore_multiplier=4,
        exact=True,
        output_index=True,
    )
    # This is a helper function to showcase how FAISS can be used with quantized embeddings.
    # You must either provide the `corpus_embeddings` or the `corpus_index` FAISS index, but not both.
    # In the first call we'll provide the `corpus_embeddings` and get the `corpus_index` back, which
    # we'll use in the next call. In practice, the index is stored in RAM or saved to disk, and not
    # recalculated for every query.
    # This function will 1) quantize the query embeddings to the same precision as the corpus embeddings,
    # 2) perform the semantic search using FAISS, 3) rescore the results using the full precision embeddings,
    # and 4) return the results and the search time (and perhaps the FAISS index).
    # `corpus_precision` must be the same as the precision used to quantize the corpus embeddings.
    # It is used to convert the query embeddings to the same precision as the corpus embeddings.
    # `top_k` determines how many results are returned for each query.
    # `rescore_multiplier` is a parameter for the rescoring step. Rather than searching for the top_k results,
    # we search for top_k * rescore_multiplier results and rescore the top_k results using the full precision embeddings.
    # So, higher values of rescore_multiplier will give better results, but will be slower.
    # `calibration_embeddings` is a set of embeddings used to calibrate the quantization of the query embeddings.
    # This is important only if you are using uint8 or int8 precision. In practice, this is used to calculate
    # the minimum and maximum values of each of the embedding dimensions, which are then used to determine the
    # quantization thresholds.
    # `rescore` determines whether to rescore the results using the full precision embeddings, if False & the
    # corpus is quantized, the results will be very poor. `exact` determines whether to use the exact search
    # or the approximate search method in FAISS.

    # 9. Output the results
    print("Precision:", corpus_precision)
    print(f"Search time: {search_time:.6f} seconds")
    for query, result in zip(queries, results):
        print(f"Query: {query}")
        for entry in result:
            print(f"(Score: {entry['score']:.4f}) {corpus[entry['corpus_id']]}")
        print("")

    # 10. Prompt for more queries
    queries = [input("Please enter a question: ")]
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_faiss
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
num_queries = 1_000
queries = corpus[:num_queries]
# 2. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 3. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# 4. Encode the queries using the full precision
query_embeddings = model.encode(queries, normalize_embeddings=True)
for exact in (True, False):
    for corpus_precision in ("float32", "uint8", "ubinary"):
        corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
        # NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
        # but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
        # the query embeddings. This is important only if you are using uint8 or int8 precision

        # 5. Perform semantic search using FAISS
        rescore_multiplier = 4
        results, search_time = semantic_search_faiss(
            query_embeddings,
            corpus_embeddings=corpus_embeddings,
            corpus_precision=corpus_precision,
            top_k=10,
            calibration_embeddings=full_corpus_embeddings,
            rescore=corpus_precision != "float32",
            rescore_multiplier=rescore_multiplier,
            exact=exact,
        )

        print(
            f"{'Exact' if exact else 'Approximate'} search time using {corpus_precision} corpus: {search_time:.6f} seconds"
            + (f" (rescore_multiplier: {rescore_multiplier})" if corpus_precision != "float32" else "")
        )
"""
This script showcases a recommended approach to perform semantic search using quantized embeddings with FAISS and usearch.
In particular, it uses binary search with int8 rescoring. The binary search is highly efficient, and its index can be kept
in memory even for massive datasets: it takes (num_dimensions * num_documents / 8) bytes, i.e. 1.19GB for 10 million embeddings.
"""
import json
import os
import time
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from datasets import load_dataset
import faiss
from usearch.index import Index
# We use usearch as it can efficiently load int8 vectors from disk.
# Load the model
# NOTE: Because we are only comparing questions here, we will use the "query" prompt for everything.
# Normally you don't use this prompt for documents, but only for the queries
model = SentenceTransformer(
"mixedbread-ai/mxbai-embed-large-v1",
prompts={"query": "Represent this sentence for searching relevant passages: "},
default_prompt_name="query",
)
# Load a corpus with texts
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# Apply some default query
query = "How do I become a good programmer?"
# Try to load the precomputed binary and int8 indices
if os.path.exists("quora_faiss_ubinary.index"):
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("quora_faiss_ubinary.index")
int8_view = Index.restore("quora_usearch_int8.index", view=True)
else:
# Encode the corpus using the full precision
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# Convert the embeddings to "ubinary" for efficient FAISS search
ubinary_embeddings = quantize_embeddings(full_corpus_embeddings, "ubinary")
binary_index = faiss.IndexBinaryFlat(1024)
binary_index.add(ubinary_embeddings)
faiss.write_index_binary(binary_index, "quora_faiss_ubinary.index")
# Convert the embeddings to "int8" for efficiently loading int8 indices with usearch
int8_embeddings = quantize_embeddings(full_corpus_embeddings, "int8")
index = Index(ndim=1024, metric="ip", dtype="i8")
index.add(np.arange(len(int8_embeddings)), int8_embeddings)
index.save("quora_usearch_int8.index")
del index
# Load the int8 index as a view, which does not cost any memory
int8_view = Index.restore("quora_usearch_int8.index", view=True)
def search(query, top_k: int = 10, rescore_multiplier: int = 4):
    # 1. Embed the query as float32
    start_time = time.time()
    query_embedding = model.encode(query)
    embed_time = time.time() - start_time

    # 2. Quantize the query to ubinary
    start_time = time.time()
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")
    quantize_time = time.time() - start_time

    # 3. Search the binary index
    start_time = time.time()
    _scores, binary_ids = binary_index.search(query_embedding_ubinary, top_k * rescore_multiplier)
    binary_ids = binary_ids[0]
    search_time = time.time() - start_time

    # 4. Load the corresponding int8 embeddings
    start_time = time.time()
    int8_embeddings = int8_view[binary_ids].astype(int)
    load_time = time.time() - start_time

    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings
    start_time = time.time()
    scores = query_embedding @ int8_embeddings.T
    rescore_time = time.time() - start_time

    # 6. Sort the scores and return the top_k
    start_time = time.time()
    indices = (-scores).argsort()[:top_k]
    top_k_indices = binary_ids[indices]
    top_k_scores = scores[indices]
    sort_time = time.time() - start_time

    return (
        top_k_scores.tolist(),
        top_k_indices.tolist(),
        {
            "Embed Time": f"{embed_time:.4f} s",
            "Quantize Time": f"{quantize_time:.4f} s",
            "Search Time": f"{search_time:.4f} s",
            "Load Time": f"{load_time:.4f} s",
            "Rescore Time": f"{rescore_time:.4f} s",
            "Sort Time": f"{sort_time:.4f} s",
            "Total Retrieval Time": f"{quantize_time + search_time + load_time + rescore_time + sort_time:.4f} s",
        },
    )
while True:
    scores, indices, timings = search(query)

    # Output the results
    print(f"Timings:\n{json.dumps(timings, indent=2)}")
    print(f"Query: {query}")
    for score, index in zip(scores, indices):
        print(f"(Score: {score:.4f}) {corpus[index]}")
    print("")

    # Prompt for more queries
    query = input("Please enter a question: ")
import time
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
# 2. Come up with some queries
queries = [
"How do I become a good programmer?",
"How do I become a good data scientist?",
]
# 3. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 4. Choose a target precision for the corpus embeddings
corpus_precision = "binary"
# Valid options are: "float32", "uint8", "int8", "ubinary", and "binary"
# But usearch only supports "float32", "int8", and "binary"
# 5. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
# NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
# but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
# the query embeddings. This is important only if you are using uint8 or int8 precision
# Initially, we don't have a usearch index yet, we can use semantic_search_usearch to create it
corpus_index = None
while True:
    # 7. Encode the queries using the full precision
    start_time = time.time()
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    print(f"Encoding time: {time.time() - start_time:.6f} seconds")

    # 8. Perform semantic search using usearch
    results, search_time, corpus_index = semantic_search_usearch(
        query_embeddings,
        corpus_index=corpus_index,
        corpus_embeddings=corpus_embeddings if corpus_index is None else None,
        corpus_precision=corpus_precision,
        top_k=10,
        calibration_embeddings=full_corpus_embeddings,
        rescore=corpus_precision != "float32",
        rescore_multiplier=4,
        exact=True,
        output_index=True,
    )
    # This is a helper function to showcase how usearch can be used with quantized embeddings.
    # You must either provide the `corpus_embeddings` or the `corpus_index` usearch index, but not both.
    # In the first call we'll provide the `corpus_embeddings` and get the `corpus_index` back, which
    # we'll use in the next call. In practice, the index is stored in RAM or saved to disk, and not
    # recalculated for every query.
    # This function will 1) quantize the query embeddings to the same precision as the corpus embeddings,
    # 2) perform the semantic search using usearch, 3) rescore the results using the full precision embeddings,
    # and 4) return the results and the search time (and perhaps the usearch index).
    # `corpus_precision` must be the same as the precision used to quantize the corpus embeddings.
    # It is used to convert the query embeddings to the same precision as the corpus embeddings.
    # `top_k` determines how many results are returned for each query.
    # `rescore_multiplier` is a parameter for the rescoring step. Rather than searching for the top_k results,
    # we search for top_k * rescore_multiplier results and rescore the top_k results using the full precision embeddings.
    # So, higher values of rescore_multiplier will give better results, but will be slower.
    # `calibration_embeddings` is a set of embeddings used to calibrate the quantization of the query embeddings.
    # This is important only if you are using uint8 or int8 precision. In practice, this is used to calculate
    # the minimum and maximum values of each of the embedding dimensions, which are then used to determine the
    # quantization thresholds.
    # `rescore` determines whether to rescore the results using the full precision embeddings, if False & the
    # corpus is quantized, the results will be very poor. `exact` determines whether to use the exact search
    # or the approximate search method in usearch.

    # 9. Output the results
    print(f"Search time: {search_time:.6f} seconds")
    for query, result in zip(queries, results):
        print(f"Query: {query}")
        for entry in result:
            print(f"(Score: {entry['score']:.4f}) {corpus[entry['corpus_id']]}")
        print("")

    # 10. Prompt for more queries
    queries = [input("Please enter a question: ")]
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings, semantic_search_usearch
from datasets import load_dataset
# 1. Load the quora corpus with questions
dataset = load_dataset("quora", split="train").map(
lambda batch: {"text": [text for sample in batch["questions"] for text in sample["text"]]},
batched=True,
remove_columns=["questions", "is_duplicate"],
)
max_corpus_size = 100_000
corpus = dataset["text"][:max_corpus_size]
num_queries = 1_000
queries = corpus[:num_queries]
# 2. Load the model
model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
# 3. Encode the corpus
full_corpus_embeddings = model.encode(corpus, normalize_embeddings=True, show_progress_bar=True)
# 4. Encode the queries using the full precision
query_embeddings = model.encode(queries, normalize_embeddings=True)
for exact in (True, False):
    for corpus_precision in ("float32", "int8", "binary"):
        corpus_embeddings = quantize_embeddings(full_corpus_embeddings, precision=corpus_precision)
        # NOTE: We can also pass "precision=..." to the encode method to quantize the embeddings directly,
        # but we want to keep the full precision embeddings to act as a calibration dataset for quantizing
        # the query embeddings. This is important only if you are using uint8 or int8 precision

        # 5. Perform semantic search using usearch
        rescore_multiplier = 4
        results, search_time = semantic_search_usearch(
            query_embeddings,
            corpus_embeddings=corpus_embeddings,
            corpus_precision=corpus_precision,
            top_k=10,
            calibration_embeddings=full_corpus_embeddings,
            rescore=corpus_precision != "float32",
            rescore_multiplier=rescore_multiplier,
            exact=exact,
        )

        print(
            f"{'Exact' if exact else 'Approximate'} search time using {corpus_precision} corpus: {search_time:.6f} seconds"
            + (f" (rescore_multiplier: {rescore_multiplier})" if corpus_precision != "float32" else "")
        )