"""
The pre-trained models produce embeddings of size 512 - 1024. However, when storing a large
number of embeddings, this requires quite a lot of memory / storage.

In this example, we reduce the dimensionality of the embeddings to e.g. 128 dimensions. This significantly
reduces the required memory / storage while maintaining nearly the same performance.

For dimensionality reduction, we compute embeddings for a large set of (representative) sentences. Then,
we use PCA to find e.g. 128 principal components of our vector space. This allows us to maintain
as much information as possible with only 128 dimensions.

PCA gives us a matrix that down-projects vectors to 128 dimensions. We use this matrix
and extend our original SentenceTransformer model with this linear down-projection. Hence,
the new SentenceTransformer model will directly produce embeddings with 128 dimensions
without further changes needed.
"""

import logging
import random

import numpy as np
import torch
from sklearn.decomposition import PCA

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Set the log level to INFO so the evaluators' results are printed
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# Model for which we apply dimensionality reduction
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# New size for the embeddings
new_dimension = 128

# We measure the performance of the original model
# and later we will measure the performance with the reduced dimension size
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
stsb_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    name="sts-test",
)

logging.info("Original model performance:")
stsb_evaluator(model)

######## Reduce the embedding dimensions ########

# Use AllNLI sentences as a (representative) sample of the embedding space
train_dataset = load_dataset("sentence-transformers/all-nli", "pair-score", split="train")
nli_sentences = train_dataset["sentence1"] + train_dataset["sentence2"]
random.shuffle(nli_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for 20k random sentences from the AllNLI dataset
pca_train_sentences = nli_sentences[0:20000]
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it will directly produce embeddings with the new size.
# The PCA components matrix (new_dimension x original_dimension) becomes the weight of a
# bias-free linear layer, i.e. the down-projection is applied after the pooling step.
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
# NOTE(review): torch.tensor(pca_comp) inherits the dtype of the embeddings
# (float32 from encode) — confirm it matches the model's dtype if this is changed.
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)

# Evaluate the model with the reduced embedding size
logging.info(f"Model with {new_dimension} dimensions:")
stsb_evaluator(model)


# Store the adapted model on disc; strip any organization prefix (e.g. "org/name")
# from the model name so it can be used as a local directory name.
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
model.save(f"{model_name}-{new_dimension}dim")

# You can then load the adapted model that produces 128-dimensional embeddings like this:
# model = SentenceTransformer('all-MiniLM-L6-v2-128dim')

# Or you can push the model to the Hugging Face Hub
# model.push_to_hub(f'{model_name}-128dim')