import random
import warnings
from typing import Any, Dict, Iterable, List, Optional, Union

import torch.nn.functional as F
from torch import Tensor, nn

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses.CachedGISTEmbedLoss import CachedGISTEmbedLoss
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import CachedMultipleNegativesRankingLoss


class ForwardDecorator:
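    """Wraps a model's ``forward`` so that the returned embeddings are truncated
    to the currently selected Matryoshka dimension.

    The full-size outputs are computed once (during the pass for the first
    dimension) and cached; passes for the remaining dimensions reuse the cached
    outputs and only truncate and re-normalize them.
    """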
    def __init__(self, fn):
        self.fn = fn

        self.dim = None        # current Matryoshka dimension to truncate to
        self.cache = []        # cached full-size model outputs for the current batch
        self.cache_dim = None  # dimension that was active when the cache was filled
        self.idx = 0           # cursor into the cache for the current pass

    def set_dim(self, dim):
        # Select the next embedding dimension and rewind the cache cursor.
        self.dim = dim
        self.idx = 0

    def shrink(self, tensor: Tensor) -> Tensor:
        tensor_dim = tensor.shape[-1]
        if self.dim > tensor_dim:
            raise ValueError(
                f"Dimension {self.dim} in matryoshka_dims cannot be greater than the model's embedding dimension: {tensor_dim}"
            )
        # Keep only the first `dim` components and re-normalize, following
        # Matryoshka Representation Learning.
        tensor = tensor[..., : self.dim]
        tensor = F.normalize(tensor, p=2, dim=-1)
        return tensor

    def __call__(self, features):
        # Growing cache: run the wrapped forward once and store its full-size output.
        if self.cache_dim is None or self.cache_dim == self.dim:
            output = self.fn(features)
            self.cache.append(output)
            self.cache_dim = self.dim
        # Using cache: reuse the stored output instead of recomputing the forward pass.
        else:
            output = self.cache[self.idx]
        # Shrink a shallow copy so the cached full-size tensors stay intact for
        # the remaining (possibly larger) Matryoshka dimensions.
        output = dict(output)
        output["token_embeddings"] = self.shrink(output["token_embeddings"])
        output["sentence_embedding"] = self.shrink(output["sentence_embedding"])
        self.idx += 1
        return output


class MatryoshkaLoss(nn.Module):
    def __init__(
        self,
        model: SentenceTransformer,
        loss: nn.Module,
        matryoshka_dims: List[int],
        matryoshka_weights: Optional[List[Union[float, int]]] = None,
        n_dims_per_step: int = -1,
    ) -> None:
        """
        The MatryoshkaLoss can be seen as a loss *modifier* that applies another loss function at multiple embedding
        dimensions. This is useful when you want to train a model whose users can truncate the embedding dimension to
        speed up their embedding comparisons and reduce costs.

        Args:
            model: SentenceTransformer model
            loss: The loss function to be used, e.g.
                :class:`MultipleNegativesRankingLoss`,
                :class:`CoSENTLoss`, etc.
            matryoshka_dims: A list of embedding dimensions to be used
                for the loss function, e.g. [768, 512, 256, 128, 64].
            matryoshka_weights: A list of weights to be used for the
                loss function, e.g. [1, 1, 1, 1, 1]. If None, then the
                weights will be set to 1 for all dimensions.
            n_dims_per_step: The number of dimensions to use per step.
                If -1, then all dimensions are used. If > 0, then a
                random sample of n_dims_per_step dimensions is used per
                step. The default value is -1.

        References:
            - The concept was introduced in this paper: https://arxiv.org/abs/2205.13147
            - `Matryoshka Embeddings <../../examples/training/matryoshka/README.html>`_

        Requirements:
            1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss` or :class:`CachedGISTEmbedLoss`.

        Relations:
            - :class:`Matryoshka2dLoss` uses this loss in combination with :class:`AdaptiveLayerLoss` which allows for
                layer reduction for faster inference.

        Input:
            +---------------------------------------+--------+
            | Texts                                 | Labels |
            +=======================================+========+
            | any                                   | any    |
            +---------------------------------------+--------+

        Example:
            ::

                from sentence_transformers import SentenceTransformer, losses, InputExample
                from torch.utils.data import DataLoader

                model = SentenceTransformer("microsoft/mpnet-base")
                train_examples = [
                    InputExample(texts=['Anchor 1', 'Positive 1']),
                    InputExample(texts=['Anchor 2', 'Positive 2']),
                ]
                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
                train_loss = losses.MultipleNegativesRankingLoss(model=model)
                train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
                model.fit(
                    [(train_dataloader, train_loss)],
                    epochs=10,
                )
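
            The same setup can also weight each dimension and randomly subsample the
            dimensions used per training step; the values below are only an illustrative
            sketch, not recommended settings:

            ::

                train_loss = losses.MultipleNegativesRankingLoss(model=model)
                train_loss = losses.MatryoshkaLoss(
                    model,
                    train_loss,
                    matryoshka_dims=[768, 512, 256, 128, 64],
                    matryoshka_weights=[1, 1, 1, 0.5, 0.5],  # hypothetical weights
                    n_dims_per_step=2,  # train on 2 randomly sampled dimensions per step
                )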
        """
        super().__init__()
        self.model = model
        self.loss = loss
        if isinstance(loss, CachedMultipleNegativesRankingLoss):
            warnings.warn("MatryoshkaLoss is not compatible with CachedMultipleNegativesRankingLoss.", stacklevel=2)
        if isinstance(loss, CachedGISTEmbedLoss):
            warnings.warn("MatryoshkaLoss is not compatible with CachedGISTEmbedLoss.", stacklevel=2)

        self.matryoshka_dims = matryoshka_dims
        if matryoshka_weights is None:
            # Default to a uniform weight of 1 for every dimension.
            matryoshka_weights = [1] * len(matryoshka_dims)
        self.matryoshka_weights = matryoshka_weights
        self.n_dims_per_step = n_dims_per_step

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor:
        original_forward = self.model.forward
        try:
            # Temporarily swap in a decorated forward so that every model call made
            # by the wrapped loss returns embeddings truncated to the current dim.
            decorated_forward = ForwardDecorator(original_forward)
            self.model.forward = decorated_forward

            dim_indices = range(len(self.matryoshka_dims))
            if 0 < self.n_dims_per_step < len(dim_indices):
                dim_indices = random.sample(dim_indices, self.n_dims_per_step)

            # Accumulate the weighted loss across the selected dimensions.
            loss = 0.0
            for idx in dim_indices:
                dim = self.matryoshka_dims[idx]
                weight = self.matryoshka_weights[idx]
                decorated_forward.set_dim(dim)
                loss += weight * self.loss(sentence_features, labels)
        finally:
            # Always restore the original forward, even if the loss computation fails.
            self.model.forward = original_forward

        return loss

    def get_config_dict(self) -> Dict[str, Any]:
        return {
            "loss": self.loss.__class__.__name__,
            "matryoshka_dims": self.matryoshka_dims,
            "matryoshka_weights": self.matryoshka_weights,
            "n_dims_per_step": self.n_dims_per_step,
        }
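
    # With the docstring's setup (a sketch, assuming MultipleNegativesRankingLoss
    # and default weights), get_config_dict() would return:
    # {
    #     "loss": "MultipleNegativesRankingLoss",
    #     "matryoshka_dims": [768, 512, 256, 128, 64],
    #     "matryoshka_weights": [1, 1, 1, 1, 1],
    #     "n_dims_per_step": -1,
    # }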

    @property
    def citation(self) -> str:
        return """
@misc{kusupati2024matryoshka,
    title={Matryoshka Representation Learning}, 
    author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
    year={2024},
    eprint={2205.13147},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
"""