"""
The pre-trained models produce embeddings of size 512 - 1024. However, when storing a large
number of embeddings, this requires quite a lot of memory / storage.

In this example, we reduce the dimensionality of the embeddings to e.g. 128 dimensions. This significantly
reduces the required memory / storage while maintaining nearly the same performance.

For dimensionality reduction, we compute embeddings for a large set of (representative) sentences. Then,
we use PCA to find e.g. 128 principal components of our vector space. This allows us to maintain
as much information as possible with only 128 dimensions.

PCA gives us a matrix that down-projects vectors to 128 dimensions. We use this matrix
and extend our original SentenceTransformer model with this linear down-projection. Hence,
the new SentenceTransformer model will directly produce embeddings with 128 dimensions
without further changes needed.
"""

import logging
import random

import numpy as np
import torch
from sklearn.decomposition import PCA

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Set the log level to INFO so the evaluators' results are printed
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# Model for which we apply dimensionality reduction
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# New size for the embeddings
new_dimension = 128

# We measure the performance of the original model
# and later we will measure the performance with the reduced dimension size
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
stsb_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    name="sts-test",
)

logging.info("Original model performance:")
stsb_evaluator(model)

######## Reduce the embedding dimensions ########

# Use AllNLI sentences as a (representative) sample of the embedding space
train_dataset = load_dataset("sentence-transformers/all-nli", "pair-score", split="train")
nli_sentences = train_dataset["sentence1"] + train_dataset["sentence2"]
random.shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it will directly produce embeddings with the new size.
# The PCA components matrix (new_dimension x original_dimension) becomes the weight of a
# bias-free linear layer, i.e. the down-projection is applied after the pooling step.
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
# NOTE(review): torch.tensor(pca_comp) inherits the dtype of the embeddings
# (float32 from encode) — confirm it matches the model's dtype if this is changed.
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)

# Evaluate the model with the reduced embedding size
logging.info(f"Model with {new_dimension} dimensions:")
stsb_evaluator(model)


# Store the adapted model on disc; strip any organization prefix (e.g. "org/name")
# from the model name so it can be used as a local directory name.
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
model.save(f"{model_name}-{new_dimension}dim")

# You can then load the adapted model that produces 128-dimensional embeddings like this:
# model = SentenceTransformer('all-MiniLM-L6-v2-128dim')

# Or you can push the model to the Hugging Face Hub
# model.push_to_hub(f'{model_name}-128dim')