Commit 24db6dab authored by Rayyyyy

first add
from sentence_transformers import losses, SentenceTransformer, util
class AnglELoss(losses.CoSENTLoss):
def __init__(self, model: SentenceTransformer, scale: float = 20.0):
"""
This class implements AnglE (Angle Optimized) loss.
This is a modification of :class:`CoSENTLoss`, designed to address the following issue:
the gradient of the cosine function approaches 0 as the cosine nears its extreme values (the top or bottom of the wave).
This can hinder the optimization process, so AnglE proposes to instead optimize the angle difference
in complex space in order to mitigate this effect.
It expects that each of the InputExamples consists of a pair of texts and a float valued label, representing
the expected similarity score between the pair.
It computes the following loss function:
``loss = log(1 + sum(exp(s(k,l) - s(i,j))))``, where ``(i,j)`` and ``(k,l)`` are any of the input pairs in the
batch such that the expected similarity of ``(i,j)`` is greater than that of ``(k,l)``. The summation is over all possible
pairs of input pairs in the batch that match this condition. This is the same as CoSENTLoss, with a different
similarity function.
:param model: SentenceTransformerModel
:param scale: Output of similarity function is multiplied by scale value. Represents the inverse temperature.
References:
- For further details, see: https://arxiv.org/abs/2309.12871v1
Requirements:
- Sentence pairs with corresponding similarity scores in range of the similarity function. Default is [-1,1].
Relations:
- :class:`CoSENTLoss` is AnglELoss with ``pairwise_cos_sim`` as the metric, rather than ``pairwise_angle_sim``.
- :class:`CosineSimilarityLoss` seems to produce a weaker training signal than ``CoSENTLoss`` or ``AnglELoss``.
Inputs:
+--------------------------------+------------------------+
| Texts | Labels |
+================================+========================+
| (sentence_A, sentence_B) pairs | float similarity score |
+--------------------------------+------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('bert-base-uncased')
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=1.0),
InputExample(texts=['My third sentence', 'Unrelated sentence'], label=0.3)]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.AnglELoss(model=model)
"""
super().__init__(model, scale, similarity_fct=util.pairwise_angle_sim)
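# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the file above): per the __init__ call, the
# only difference between AnglELoss and CoSENTLoss is the pairwise similarity
# function handed to the parent class. The toy tensors below are made up and
# assume a sentence-transformers version that ships util.pairwise_angle_sim;
# they only illustrate that both functions map two (N, D) batches to an (N,)
# vector of per-pair similarities.
import torch
from sentence_transformers import util

emb_a = torch.randn(4, 16)
emb_b = torch.randn(4, 16)
print(util.pairwise_cos_sim(emb_a, emb_b).shape)    # torch.Size([4])
print(util.pairwise_angle_sim(emb_a, emb_b).shape)  # torch.Size([4])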
from torch import nn, Tensor
from typing import Iterable, Dict
from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction
from sentence_transformers.SentenceTransformer import SentenceTransformer
class BatchAllTripletLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance,
margin: float = 5,
):
"""
BatchAllTripletLoss takes a batch with (sentence, label) pairs and computes the loss for all possible, valid
triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. The labels
must be integers, with same label indicating sentences from the same class. Your train dataset
must contain at least 2 examples per label class.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class BatchHardTripletLossDistanceFunction contains pre-defined metrics that can be used.
:param margin: Negative samples should be at least margin further apart from the anchor than the positive.
References:
* Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
* Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
* Blog post: https://omoindrot.github.io/triplet-loss
Requirements:
1. Each sentence must be labeled with a class.
2. Your dataset must contain at least 2 examples per label class.
Relations:
* :class:`BatchHardTripletLoss` uses only the hardest positive and negative samples, rather than all possible, valid triplets.
* :class:`BatchHardSoftMarginTripletLoss` uses only the hardest positive and negative samples, rather than all possible, valid triplets.
Also, it does not require setting a margin.
* :class:`BatchSemiHardTripletLoss` uses only semi-hard, valid triplets, rather than all possible, valid triplets.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | class |
+------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
InputExample(texts=['Sentence from class 0'], label=0),
InputExample(texts=['Another sentence from class 0'], label=0),
InputExample(texts=['Sentence from class 1'], label=1),
InputExample(texts=['Sentence from class 2'], label=2),
]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchAllTripletLoss(model=model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(BatchAllTripletLoss, self).__init__()
self.sentence_embedder = model
self.triplet_margin = margin
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"]
return self.batch_all_triplet_loss(labels, rep)
def batch_all_triplet_loss(self, labels, embeddings):
"""Build the triplet loss over a batch of embeddings.
We generate all the valid triplets and average the loss over the positive ones.
Args:
labels: labels of the batch, of size (batch_size,)
embeddings: tensor of shape (batch_size, embed_dim)
margin: margin for triplet loss
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
Label_Sentence_Triplet: scalar tensor containing the triplet loss
"""
# Get the pairwise distance matrix
pairwise_dist = self.distance_metric(embeddings)
anchor_positive_dist = pairwise_dist.unsqueeze(2)
anchor_negative_dist = pairwise_dist.unsqueeze(1)
# Compute a 3D tensor of size (batch_size, batch_size, batch_size)
# triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k
# Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1)
# and the 2nd (batch_size, 1, batch_size)
triplet_loss = anchor_positive_dist - anchor_negative_dist + self.triplet_margin
# Put to zero the invalid triplets
# (where label(a) != label(p) or label(n) == label(a) or a == p)
mask = BatchHardTripletLoss.get_triplet_mask(labels)
triplet_loss = mask.float() * triplet_loss
# Remove negative losses (i.e. the easy triplets)
triplet_loss[triplet_loss < 0] = 0
# Count number of positive triplets (where triplet_loss > 0)
valid_triplets = triplet_loss[triplet_loss > 1e-16]
num_positive_triplets = valid_triplets.size(0)
# num_valid_triplets = mask.sum()
# fraction_positive_triplets = num_positive_triplets / (num_valid_triplets.float() + 1e-16)
# Get final mean triplet loss over the positive valid triplets
triplet_loss = triplet_loss.sum() / (num_positive_triplets + 1e-16)
return triplet_loss
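# ---------------------------------------------------------------------------
# Hedged sketch (not part of the file above): BatchAllTripletLoss builds its
# loss over every valid triplet in the batch and then averages over the ones
# with positive loss. get_triplet_mask marks (a, p, n) as valid only when the
# indices are distinct, label(a) == label(p) and label(a) != label(n). The toy
# labels below are made up.
import torch
from sentence_transformers.losses import BatchHardTripletLoss

labels = torch.tensor([0, 0, 1, 2])  # two sentences share class 0
mask = BatchHardTripletLoss.get_triplet_mask(labels)
print(mask.sum().item())  # 4: anchors 0/1 with each other as positive, sentences 2 and 3 as negatives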
import torch
from torch import Tensor
from typing import Iterable, Dict
from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction
from sentence_transformers.SentenceTransformer import SentenceTransformer
class BatchHardSoftMarginTripletLoss(BatchHardTripletLoss):
def __init__(
self, model: SentenceTransformer, distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance
):
"""
BatchHardSoftMarginTripletLoss takes a batch with (sentence, label) pairs and computes the loss over the valid
triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. For each anchor,
it then uses the hardest positive and the hardest negative. The labels must be integers, with the same label indicating
sentences from the same class. Your train dataset must contain at least 2 examples per label class. This soft-margin
variant does not require setting a margin.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class BatchHardTripletLossDistanceFunction contains pre-defined metrics that can be used.
Definitions:
:Easy triplets: Triplets which have a loss of 0 because
``distance(anchor, positive) + margin < distance(anchor, negative)``.
:Hard triplets: Triplets where the negative is closer to the anchor than the positive, i.e.,
``distance(anchor, negative) < distance(anchor, positive)``.
:Semi-hard triplets: Triplets where the negative is not closer to the anchor than the positive, but which
still have a positive loss, i.e., ``distance(anchor, negative) < distance(anchor, positive) + margin``.
References:
* Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
* Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
* Blog post: https://omoindrot.github.io/triplet-loss
Requirements:
1. Each sentence must be labeled with a class.
2. Your dataset must contain at least 2 examples per label class.
3. Your dataset should contain hard positives and negatives.
Relations:
* :class:`BatchHardTripletLoss` uses a user-specified margin, while this loss does not require setting a margin.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | class |
+------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
InputExample(texts=['Sentence from class 0'], label=0),
InputExample(texts=['Another sentence from class 0'], label=0),
InputExample(texts=['Sentence from class 1'], label=1),
InputExample(texts=['Sentence from class 2'], label=2)
]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(BatchHardSoftMarginTripletLoss, self).__init__(model)
self.sentence_embedder = model
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"]
return self.batch_hard_triplet_soft_margin_loss(labels, rep)
# Hard Triplet Loss with Soft Margin
# Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
def batch_hard_triplet_soft_margin_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor:
"""Build the triplet loss over a batch of embeddings.
For each anchor, we get the hardest positive and hardest negative to form a triplet.
Args:
labels: labels of the batch, of size (batch_size,)
embeddings: tensor of shape (batch_size, embed_dim)
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
Label_Sentence_Triplet: scalar tensor containing the triplet loss
"""
# Get the pairwise distance matrix
pairwise_dist = self.distance_metric(embeddings)
# For each anchor, get the hardest positive
# First, we need to get a mask for every valid positive (they should have same label)
mask_anchor_positive = BatchHardTripletLoss.get_anchor_positive_triplet_mask(labels).float()
# We put to 0 any element where (a, p) is not valid (valid if a != p and label(a) == label(p))
anchor_positive_dist = mask_anchor_positive * pairwise_dist
# shape (batch_size, 1)
hardest_positive_dist, _ = anchor_positive_dist.max(1, keepdim=True)
# For each anchor, get the hardest negative
# First, we need to get a mask for every valid negative (they should have different labels)
mask_anchor_negative = BatchHardTripletLoss.get_anchor_negative_triplet_mask(labels).float()
# We add the maximum value in each row to the invalid negatives (label(a) == label(n))
max_anchor_negative_dist, _ = pairwise_dist.max(1, keepdim=True)
anchor_negative_dist = pairwise_dist + max_anchor_negative_dist * (1.0 - mask_anchor_negative)
# shape (batch_size,)
hardest_negative_dist, _ = anchor_negative_dist.min(1, keepdim=True)
# Combine biggest d(a, p) and smallest d(a, n) into final triplet loss with soft margin
# tl = hardest_positive_dist - hardest_negative_dist + margin
# tl[tl < 0] = 0
tl = torch.log1p(torch.exp(hardest_positive_dist - hardest_negative_dist))
triplet_loss = tl.mean()
return triplet_loss
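# ---------------------------------------------------------------------------
# Hedged numeric sketch (not part of the file above): the soft-margin term
# log1p(exp(d_ap - d_an)) used in batch_hard_triplet_soft_margin_loss is the
# softplus function, so the loss decays smoothly towards 0 once the hardest
# negative sits further from the anchor than the hardest positive, instead of
# being clipped at a fixed margin. The distances below are made up.
import torch

d_ap = torch.tensor([0.2, 1.0, 3.0])  # toy hardest anchor-positive distances
d_an = torch.tensor([2.0, 1.0, 0.5])  # toy hardest anchor-negative distances
print(torch.log1p(torch.exp(d_ap - d_an)))  # approximately [0.15, 0.69, 2.58]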
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from sentence_transformers import util
from sentence_transformers.SentenceTransformer import SentenceTransformer
class BatchHardTripletLossDistanceFunction:
"""
This class defines distance functions that can be used with Batch[All/Hard/SemiHard]TripletLoss
"""
@staticmethod
def cosine_distance(embeddings):
"""
Compute the 2D matrix of cosine distances (1-cosine_similarity) between all embeddings.
"""
return 1 - util.pytorch_cos_sim(embeddings, embeddings)
@staticmethod
def eucledian_distance(embeddings, squared=False):
"""
Compute the 2D matrix of euclidean distances between all the embeddings.
Args:
embeddings: tensor of shape (batch_size, embed_dim)
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
pairwise_distances: tensor of shape (batch_size, batch_size)
"""
dot_product = torch.matmul(embeddings, embeddings.t())
# Get squared L2 norm for each embedding. We can just take the diagonal of `dot_product`.
# This also provides more numerical stability (the diagonal of the result will be exactly 0).
# shape (batch_size,)
square_norm = torch.diag(dot_product)
# Compute the pairwise distance matrix as we have:
# ||a - b||^2 = ||a||^2 - 2 <a, b> + ||b||^2
# shape (batch_size, batch_size)
distances = square_norm.unsqueeze(0) - 2.0 * dot_product + square_norm.unsqueeze(1)
# Because of computation errors, some distances might be negative so we put everything >= 0.0
distances[distances < 0] = 0
if not squared:
# Because the gradient of sqrt is infinite when distances == 0.0 (ex: on the diagonal)
# we need to add a small epsilon where distances == 0.0
mask = distances.eq(0).float()
distances = distances + mask * 1e-16
distances = (1.0 - mask) * torch.sqrt(distances)
return distances
class BatchHardTripletLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance,
margin: float = 5,
):
"""
BatchHardTripletLoss takes a batch with (sentence, label) pairs and computes the loss for all possible, valid
triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. It then looks
for the hardest positive and the hardest negatives.
The labels must be integers, with same label indicating sentences from the same class. Your train dataset
must contain at least 2 examples per label class.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class BatchHardTripletLossDistanceFunction contains pre-defined metrics that can be used
:param margin: Negative samples should be at least margin further apart from the anchor than the positive.
Definitions:
:Easy triplets: Triplets which have a loss of 0 because
``distance(anchor, positive) + margin < distance(anchor, negative)``.
:Hard triplets: Triplets where the negative is closer to the anchor than the positive, i.e.,
``distance(anchor, negative) < distance(anchor, positive)``.
:Semi-hard triplets: Triplets where the negative is not closer to the anchor than the positive, but which
still have a positive loss, i.e., ``distance(anchor, negative) < distance(anchor, positive) + margin``.
References:
* Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
* Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
* Blog post: https://omoindrot.github.io/triplet-loss
Requirements:
1. Each sentence must be labeled with a class.
2. Your dataset must contain at least 2 examples per label class.
3. Your dataset should contain hard positives and negatives.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | class |
+------------------+--------+
Relations:
* :class:`BatchAllTripletLoss` uses all possible, valid triplets, rather than only the hardest positive and negative samples.
* :class:`BatchSemiHardTripletLoss` uses only semi-hard, valid triplets, rather than only the hardest positive and negative samples.
* :class:`BatchHardSoftMarginTripletLoss` does not require setting a margin, while this loss does.
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
InputExample(texts=['Sentence from class 0'], label=0),
InputExample(texts=['Another sentence from class 0'], label=0),
InputExample(texts=['Sentence from class 1'], label=1),
InputExample(texts=['Sentence from class 2'], label=2)
]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchHardTripletLoss(model=model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(BatchHardTripletLoss, self).__init__()
self.sentence_embedder = model
self.triplet_margin = margin
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"]
return self.batch_hard_triplet_loss(labels, rep)
# Hard Triplet Loss
# Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
# Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
# Blog post: https://omoindrot.github.io/triplet-loss
def batch_hard_triplet_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor:
"""Build the triplet loss over a batch of embeddings.
For each anchor, we get the hardest positive and hardest negative to form a triplet.
Args:
labels: labels of the batch, of size (batch_size,)
embeddings: tensor of shape (batch_size, embed_dim)
margin: margin for triplet loss
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
Label_Sentence_Triplet: scalar tensor containing the triplet loss
"""
# Get the pairwise distance matrix
pairwise_dist = self.distance_metric(embeddings)
# For each anchor, get the hardest positive
# First, we need to get a mask for every valid positive (they should have same label)
mask_anchor_positive = BatchHardTripletLoss.get_anchor_positive_triplet_mask(labels).float()
# We put to 0 any element where (a, p) is not valid (valid if a != p and label(a) == label(p))
anchor_positive_dist = mask_anchor_positive * pairwise_dist
# shape (batch_size, 1)
hardest_positive_dist, _ = anchor_positive_dist.max(1, keepdim=True)
# For each anchor, get the hardest negative
# First, we need to get a mask for every valid negative (they should have different labels)
mask_anchor_negative = BatchHardTripletLoss.get_anchor_negative_triplet_mask(labels).float()
# We add the maximum value in each row to the invalid negatives (label(a) == label(n))
max_anchor_negative_dist, _ = pairwise_dist.max(1, keepdim=True)
anchor_negative_dist = pairwise_dist + max_anchor_negative_dist * (1.0 - mask_anchor_negative)
# shape (batch_size,)
hardest_negative_dist, _ = anchor_negative_dist.min(1, keepdim=True)
# Combine biggest d(a, p) and smallest d(a, n) into final triplet loss
tl = hardest_positive_dist - hardest_negative_dist + self.triplet_margin
tl[tl < 0] = 0
triplet_loss = tl.mean()
return triplet_loss
@staticmethod
def get_triplet_mask(labels):
"""Return a 3D mask where mask[a, p, n] is True iff the triplet (a, p, n) is valid.
A triplet (i, j, k) is valid if:
- i, j, k are distinct
- labels[i] == labels[j] and labels[i] != labels[k]
Args:
labels: tf.int32 `Tensor` with shape [batch_size]
"""
# Check that i, j and k are distinct
indices_equal = torch.eye(labels.size(0), device=labels.device).bool()
indices_not_equal = ~indices_equal
i_not_equal_j = indices_not_equal.unsqueeze(2)
i_not_equal_k = indices_not_equal.unsqueeze(1)
j_not_equal_k = indices_not_equal.unsqueeze(0)
distinct_indices = (i_not_equal_j & i_not_equal_k) & j_not_equal_k
label_equal = labels.unsqueeze(0) == labels.unsqueeze(1)
i_equal_j = label_equal.unsqueeze(2)
i_equal_k = label_equal.unsqueeze(1)
valid_labels = ~i_equal_k & i_equal_j
return valid_labels & distinct_indices
@staticmethod
def get_anchor_positive_triplet_mask(labels):
"""Return a 2D mask where mask[a, p] is True iff a and p are distinct and have same label.
Args:
labels: tf.int32 `Tensor` with shape [batch_size]
Returns:
mask: tf.bool `Tensor` with shape [batch_size, batch_size]
"""
# Check that i and j are distinct
indices_equal = torch.eye(labels.size(0), device=labels.device).bool()
indices_not_equal = ~indices_equal
# Check if labels[i] == labels[j]
# Uses broadcasting where the 1st argument has shape (1, batch_size) and the 2nd (batch_size, 1)
labels_equal = labels.unsqueeze(0) == labels.unsqueeze(1)
return labels_equal & indices_not_equal
@staticmethod
def get_anchor_negative_triplet_mask(labels):
"""Return a 2D mask where mask[a, n] is True iff a and n have distinct labels.
Args:
labels: tf.int32 `Tensor` with shape [batch_size]
Returns:
mask: tf.bool `Tensor` with shape [batch_size, batch_size]
"""
# Check if labels[i] != labels[k]
# Uses broadcasting where the 1st argument has shape (1, batch_size) and the 2nd (batch_size, 1)
return ~(labels.unsqueeze(0) == labels.unsqueeze(1))
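# ---------------------------------------------------------------------------
# Hedged sketch (not part of the file above): the building blocks of
# batch_hard_triplet_loss on a toy batch. The embeddings and labels are made
# up; they only show the pairwise distance matrix and the two masks that the
# hardest-positive / hardest-negative mining is built on.
import torch
from sentence_transformers.losses import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction

emb = torch.tensor([[0.0, 0.0], [0.0, 1.0], [3.0, 0.0]])
labels = torch.tensor([0, 0, 1])
dist = BatchHardTripletLossDistanceFunction.eucledian_distance(emb)       # (3, 3) pairwise distances
pos_mask = BatchHardTripletLoss.get_anchor_positive_triplet_mask(labels)  # True where a != p and labels match
neg_mask = BatchHardTripletLoss.get_anchor_negative_triplet_mask(labels)  # True where labels differ
print(dist, pos_mask, neg_mask, sep="\n")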
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from .BatchHardTripletLoss import BatchHardTripletLossDistanceFunction
from sentence_transformers.SentenceTransformer import SentenceTransformer
class BatchSemiHardTripletLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance,
margin: float = 5,
):
"""
BatchSemiHardTripletLoss takes a batch with (sentence, label) pairs and computes the loss for all possible, valid
triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. It then uses
only the semi-hard triplets.
The labels must be integers, with the same label indicating sentences from the same class. Your train dataset
must contain at least 2 examples per label class.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class BatchHardTripletLossDistanceFunction contains pre-defined metrics that can be used
:param margin: Negative samples should be at least margin further apart from the anchor than the positive.
Definitions:
:Easy triplets: Triplets which have a loss of 0 because
``distance(anchor, positive) + margin < distance(anchor, negative)``.
:Hard triplets: Triplets where the negative is closer to the anchor than the positive, i.e.,
``distance(anchor, negative) < distance(anchor, positive)``.
:Semi-hard triplets: Triplets where the negative is not closer to the anchor than the positive, but which
still have a positive loss, i.e., ``distance(anchor, negative) < distance(anchor, positive) + margin``.
References:
* Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
* Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
* Blog post: https://omoindrot.github.io/triplet-loss
Requirements:
1. Each sentence must be labeled with a class.
2. Your dataset must contain at least 2 examples per label class.
3. Your dataset should contain semi-hard positives and negatives.
Relations:
* :class:`BatchHardTripletLoss` uses only the hardest positive and negative samples, rather than only the semi-hard positives and negatives.
* :class:`BatchAllTripletLoss` uses all possible, valid triplets, rather than only the semi-hard positives and negatives.
* :class:`BatchHardSoftMarginTripletLoss` uses only the hardest positive and negative samples, rather than only the semi-hard positives and negatives.
Also, it does not require setting a margin.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | class |
+------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
InputExample(texts=['Sentence from class 0'], label=0),
InputExample(texts=['Another sentence from class 0'], label=0),
InputExample(texts=['Sentence from class 1'], label=1),
InputExample(texts=['Sentence from class 2'], label=2)
]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchSemiHardTripletLoss(model=model)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(BatchSemiHardTripletLoss, self).__init__()
self.sentence_embedder = model
self.margin = margin
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"]
return self.batch_semi_hard_triplet_loss(labels, rep)
# Semi-Hard Triplet Loss
# Based on: https://github.com/tensorflow/addons/blob/master/tensorflow_addons/losses/triplet.py#L71
# Paper: FaceNet: A Unified Embedding for Face Recognition and Clustering: https://arxiv.org/pdf/1503.03832.pdf
def batch_semi_hard_triplet_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor:
"""Build the triplet loss over a batch of embeddings.
We generate all the valid triplets and average the loss over the positive ones.
Args:
labels: labels of the batch, of size (batch_size,)
embeddings: tensor of shape (batch_size, embed_dim)
margin: margin for triplet loss
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
Label_Sentence_Triplet: scalar tensor containing the triplet loss
"""
labels = labels.unsqueeze(1)
pdist_matrix = self.distance_metric(embeddings)
adjacency = labels == labels.t()
adjacency_not = ~adjacency
batch_size = torch.numel(labels)
pdist_matrix_tile = pdist_matrix.repeat([batch_size, 1])
mask = adjacency_not.repeat([batch_size, 1]) & (pdist_matrix_tile > torch.reshape(pdist_matrix.t(), [-1, 1]))
mask_final = torch.reshape(torch.sum(mask, 1, keepdims=True) > 0.0, [batch_size, batch_size])
mask_final = mask_final.t()
negatives_outside = torch.reshape(
BatchSemiHardTripletLoss._masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size]
)
negatives_outside = negatives_outside.t()
negatives_inside = BatchSemiHardTripletLoss._masked_maximum(pdist_matrix, adjacency_not)
negatives_inside = negatives_inside.repeat([1, batch_size])
semi_hard_negatives = torch.where(mask_final, negatives_outside, negatives_inside)
loss_mat = (pdist_matrix - semi_hard_negatives) + self.margin
mask_positives = adjacency.float().to(labels.device) - torch.eye(batch_size, device=labels.device)
mask_positives = mask_positives.to(labels.device)
num_positives = torch.sum(mask_positives)
triplet_loss = (
torch.sum(torch.max(loss_mat * mask_positives, torch.tensor([0.0], device=labels.device))) / num_positives
)
return triplet_loss
@staticmethod
def _masked_minimum(data, mask, dim=1):
axis_maximums, _ = data.max(dim, keepdims=True)
masked_minimums = (data - axis_maximums) * mask
masked_minimums, _ = masked_minimums.min(dim, keepdims=True)
masked_minimums += axis_maximums
return masked_minimums
@staticmethod
def _masked_maximum(data, mask, dim=1):
axis_minimums, _ = data.min(dim, keepdims=True)
masked_maximums = (data - axis_minimums) * mask
masked_maximums, _ = masked_maximums.max(dim, keepdims=True)
masked_maximums += axis_minimums
return masked_maximums
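# ---------------------------------------------------------------------------
# Hedged numeric sketch (not part of the file above) of the triplet categories
# defined in the docstrings: easy, hard and semi-hard negatives for a single
# anchor, with a made-up d(a, p) = 2.0 and the default margin of 5.0.
d_ap, margin = 2.0, 5.0
for d_an in (8.0, 1.0, 4.0):  # easy, hard, semi-hard negative, respectively
    loss = max(d_ap - d_an + margin, 0.0)
    print(f"d(a,n)={d_an}: loss={loss}")
# d(a,n)=8.0 -> 0.0 (easy: d(a,p) + margin < d(a,n))
# d(a,n)=1.0 -> 6.0 (hard: d(a,n) < d(a,p))
# d(a,n)=4.0 -> 3.0 (semi-hard: d(a,p) < d(a,n) < d(a,p) + margin)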
from __future__ import annotations
from contextlib import nullcontext
from functools import partial
import torch
from torch import nn, Tensor
from torch.utils.checkpoint import get_device_states, set_device_states
from typing import Iterable, Dict, Iterator, List, Optional, Tuple, Callable
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import tqdm
class RandContext:
"""
Random-state context manager class. Reference: https://github.com/luyug/GradCache.
This class will back up the pytorch's random state during initialization. Then when the context is activated,
the class will set up the random state with the backed-up one.
"""
def __init__(self, *tensors):
self.fwd_cpu_state = torch.get_rng_state()
self.fwd_gpu_devices, self.fwd_gpu_states = get_device_states(*tensors)
def __enter__(self):
self._fork = torch.random.fork_rng(devices=self.fwd_gpu_devices, enabled=True)
self._fork.__enter__()
torch.set_rng_state(self.fwd_cpu_state)
set_device_states(self.fwd_gpu_devices, self.fwd_gpu_states)
def __exit__(self, exc_type, exc_val, exc_tb):
self._fork.__exit__(exc_type, exc_val, exc_tb)
self._fork = None
def _backward_hook(
grad_output: Tensor,
sentence_features: Iterable[Dict[str, Tensor]],
loss_obj: CachedMultipleNegativesRankingLoss,
):
"""A backward hook to backpropagate the cached gradients mini-batch by mini-batch."""
assert loss_obj.cache is not None
assert loss_obj.random_states is not None
with torch.enable_grad():
for sentence_feature, grad, random_states in zip(sentence_features, loss_obj.cache, loss_obj.random_states):
for (reps_mb, _), grad_mb in zip(
loss_obj.embed_minibatch_iter(
sentence_feature=sentence_feature,
with_grad=True,
copy_random_state=False,
random_states=random_states,
),
grad,
):
surrogate = torch.dot(reps_mb.flatten(), grad_mb.flatten()) * grad_output
surrogate.backward()
class CachedMultipleNegativesRankingLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
scale: float = 20.0,
similarity_fct: Callable[[Tensor, Tensor], Tensor] = util.cos_sim,
mini_batch_size: int = 32,
show_progress_bar: bool = False,
):
"""
Boosted version of MultipleNegativesRankingLoss (https://arxiv.org/pdf/1705.00652.pdf) using GradCache (https://arxiv.org/pdf/2101.06983.pdf).
Contrastive learning (here, the MNRL loss) with in-batch negatives is usually hard to scale to large batch sizes due to (GPU) memory limitations.
Standard batch-scaling methods such as gradient accumulation do not help, because the in-batch negatives make the data points within
the same batch non-independent, so the batch cannot simply be broken down into mini-batches. GradCache is a smart way to solve this problem.
It divides the computation into two stages, embedding and loss calculation, both of which can be run in mini-batches.
As a result, a constant amount of memory (e.g. enough for batch size = 32) can now process much larger batches (e.g. 65536).
In detail:
(1) It first does a quick embedding step without gradients/computation graphs to get all the embeddings;
(2) Then it calculates the loss, backpropagates up to the embeddings, and caches the gradients w.r.t. the embeddings;
(3) Finally, it runs a second embedding step with gradients/computation graphs and connects the cached gradients into the backward chain.
Notes: All steps are done with mini-batches. In the original implementation of GradCache, (2) is not done in mini-batches and
requires a lot of memory when the batch size is large. The drawback is speed: according to the paper, GradCache sacrifices around 20% of the computation time.
:param model: SentenceTransformer model
:param scale: Output of similarity function is multiplied by scale value
:param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1)
References:
- Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4: https://arxiv.org/pdf/1705.00652.pdf
- Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup: https://arxiv.org/pdf/2101.06983.pdf
Requirements:
1. (anchor, positive) pairs or (anchor, positive, negative) triplets
2. Should be used with large batch sizes for superior performance, but has slower training time than :class:`MultipleNegativesRankingLoss`
Relations:
- Equivalent to :class:`MultipleNegativesRankingLoss`, but with caching that allows for much higher batch sizes
(and thus better performance) without extra memory usage. This loss also trains roughly 2x to 2.4x slower than
:class:`MultipleNegativesRankingLoss`.
Inputs:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| (anchor, positive) pairs | none |
+---------------------------------------+--------+
| (anchor, positive, negative) triplets | none |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-uncased')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=1024) # Here we can try much larger batch sizes!
train_loss = losses.CachedMultipleNegativesRankingLoss(model=model, mini_batch_size = 32)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(CachedMultipleNegativesRankingLoss, self).__init__()
self.model = model
self.scale = scale
self.similarity_fct = similarity_fct
self.cross_entropy_loss = nn.CrossEntropyLoss()
self.mini_batch_size = mini_batch_size
self.cache: Optional[List[List[Tensor]]] = None
self.random_states: Optional[List[List[RandContext]]] = None
self.show_progress_bar = show_progress_bar
def embed_minibatch(
self,
sentence_feature: Dict[str, Tensor],
begin: int,
end: int,
with_grad: bool,
copy_random_state: bool,
random_state: Optional[RandContext] = None,
) -> Tuple[Tensor, Optional[RandContext]]:
"""Do forward pass on a minibatch of the input features and return corresponding embeddings."""
grad_context = nullcontext if with_grad else torch.no_grad
random_state_context = nullcontext() if random_state is None else random_state
sentence_feature_minibatch = {k: v[begin:end] for k, v in sentence_feature.items()}
with random_state_context:
with grad_context():
random_state = RandContext(*sentence_feature_minibatch.values()) if copy_random_state else None
reps = self.model(sentence_feature_minibatch)["sentence_embedding"] # (mbsz, hdim)
return reps, random_state
def embed_minibatch_iter(
self,
sentence_feature: Dict[str, Tensor],
with_grad: bool,
copy_random_state: bool,
random_states: Optional[List[RandContext]] = None,
) -> Iterator[Tuple[Tensor, Optional[RandContext]]]:
"""Do forward pass on all the minibatches of the input features and yield corresponding embeddings."""
input_ids: Tensor = sentence_feature["input_ids"]
bsz, _ = input_ids.shape
for i, b in enumerate(
tqdm.trange(
0,
bsz,
self.mini_batch_size,
desc="Embed mini-batches",
disable=not self.show_progress_bar,
)
):
e = b + self.mini_batch_size
reps, random_state = self.embed_minibatch(
sentence_feature=sentence_feature,
begin=b,
end=e,
with_grad=with_grad,
copy_random_state=copy_random_state,
random_state=None if random_states is None else random_states[i],
)
yield reps, random_state # reps: (mbsz, hdim)
def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]]) -> Tensor:
"""Calculate the cross-entropy loss and cache the gradients wrt. the embeddings."""
embeddings_a = torch.cat(reps[0]) # (bsz, hdim)
embeddings_b = torch.cat([torch.cat(r) for r in reps[1:]]) # ((1 + nneg) * bsz, hdim)
batch_size = len(embeddings_a)
labels = torch.tensor(
range(batch_size), dtype=torch.long, device=embeddings_a.device
) # (bsz, (1 + nneg) * bsz) Example a[i] should match with b[i]
losses: List[torch.Tensor] = []
for b in tqdm.trange(
0,
batch_size,
self.mini_batch_size,
desc="Preparing caches",
disable=not self.show_progress_bar,
):
e = b + self.mini_batch_size
scores: Tensor = self.similarity_fct(embeddings_a[b:e], embeddings_b) * self.scale
loss_mbatch: torch.Tensor = self.cross_entropy_loss(scores, labels[b:e]) * len(scores) / batch_size
loss_mbatch.backward()
losses.append(loss_mbatch.detach())
loss = sum(losses).requires_grad_()
self.cache = [[r.grad for r in rs] for rs in reps] # e.g. 3 * bsz/mbsz * (mbsz, hdim)
return loss
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor:
# Step (1): A quick embedding step without gradients/computation graphs to get all the embeddings
reps = []
self.random_states = [] # Copy random states to guarantee exact reproduction of the embeddings during the second forward pass, i.e. step (3)
for sentence_feature in sentence_features:
reps_mbs = []
random_state_mbs = []
for reps_mb, random_state in self.embed_minibatch_iter(
sentence_feature=sentence_feature,
with_grad=False,
copy_random_state=True,
):
reps_mbs.append(reps_mb.detach().requires_grad_())
random_state_mbs.append(random_state)
reps.append(reps_mbs)
self.random_states.append(random_state_mbs)
# Step (2): Calculate the loss, backward up to the embeddings and cache the gradients wrt. to the embeddings
loss = self.calculate_loss_and_cache_gradients(reps)
# Step (3): A 2nd embedding step with gradients/computation graphs and connect the cached gradients into the backward chain
loss.register_hook(partial(_backward_hook, sentence_features=sentence_features, loss_obj=self))
return loss
def get_config_dict(self):
return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__}
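# ---------------------------------------------------------------------------
# Hedged, library-independent sketch (not part of the file above) of the
# GradCache trick implemented by CachedMultipleNegativesRankingLoss: embed
# without gradients, cache d(loss)/d(embedding) on detached copies, then
# re-embed with gradients and backpropagate through a dot-product surrogate.
# The linear layer and squared-norm "loss" are stand-ins, not the real model/loss.
import torch

encoder = torch.nn.Linear(4, 4)      # stand-in for the sentence encoder
x = torch.randn(8, 4)                # stand-in for one tokenized mini-batch

with torch.no_grad():                # step (1): cheap forward pass, no graph
    reps = encoder(x)
reps = reps.detach().requires_grad_()

loss = (reps ** 2).sum()             # stand-in for the ranking loss
loss.backward()                      # step (2): gradients w.r.t. the embeddings only
cached_grad = reps.grad

reps_again = encoder(x)              # step (3): full forward pass with graph
surrogate = torch.dot(reps_again.flatten(), cached_grad.flatten())
surrogate.backward()                 # chain rule: the encoder now holds the same gradients
print(encoder.weight.grad.shape)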
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from ..SentenceTransformer import SentenceTransformer
from .. import util
class CoSENTLoss(nn.Module):
def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct=util.pairwise_cos_sim):
"""
This class implements CoSENT (Cosine Sentence) loss.
It expects that each of the InputExamples consists of a pair of texts and a float valued label, representing
the expected similarity score between the pair.
It computes the following loss function:
``loss = log(1 + sum(exp(s(k,l) - s(i,j))))``, where ``(i,j)`` and ``(k,l)`` are any of the input pairs in the
batch such that the expected similarity of ``(i,j)`` is greater than that of ``(k,l)``. The summation is over all possible
pairs of input pairs in the batch that match this condition.
Anecdotal experiments show that this loss function produces a more powerful training signal than :class:`CosineSimilarityLoss`,
resulting in faster convergence and a final model with superior performance. Consequently, CoSENTLoss may be used
as a drop-in replacement for :class:`CosineSimilarityLoss` in any training script.
:param model: SentenceTransformerModel
:param similarity_fct: Function to compute the PAIRWISE similarity between embeddings. Default is ``util.pairwise_cos_sim``.
:param scale: Output of similarity function is multiplied by scale value. Represents the inverse temperature.
References:
- For further details, see: https://kexue.fm/archives/8847
Requirements:
- Sentence pairs with corresponding similarity scores in range of the similarity function. Default is [-1,1].
Relations:
- :class:`AnglELoss` is CoSENTLoss with ``pairwise_angle_sim`` as the metric, rather than ``pairwise_cos_sim``.
- :class:`CosineSimilarityLoss` seems to produce a weaker training signal than CoSENTLoss. In our experiments, CoSENTLoss is recommended.
Inputs:
+--------------------------------+------------------------+
| Texts | Labels |
+================================+========================+
| (sentence_A, sentence_B) pairs | float similarity score |
+--------------------------------+------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('bert-base-uncased')
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=1.0),
InputExample(texts=['My third sentence', 'Unrelated sentence'], label=0.3)]
train_batch_size = 2
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CoSENTLoss(model=model)
"""
super(CoSENTLoss, self).__init__()
self.model = model
self.similarity_fct = similarity_fct
self.scale = scale
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
scores = self.similarity_fct(embeddings[0], embeddings[1])
scores = scores * self.scale
scores = scores[:, None] - scores[None, :]
# label matrix indicating which pairs are relevant
labels = labels[:, None] < labels[None, :]
labels = labels.float()
# mask out irrelevant pairs so they are negligible after exp()
scores = scores - (1 - labels) * 1e12
# append a zero as e^0 = 1
scores = torch.cat((torch.zeros(1).to(scores.device), scores.view(-1)), dim=0)
loss = torch.logsumexp(scores, dim=0)
return loss
def get_config_dict(self):
return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__}
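# ---------------------------------------------------------------------------
# Hedged numeric sketch (not part of the file above) of the CoSENT objective
# computed in forward() above, using made-up scaled similarity scores and gold
# labels instead of real embeddings.
import torch

scores = torch.tensor([0.9, 0.2, 0.6]) * 20.0          # scaled similarity per sentence pair
labels = torch.tensor([1.0, 0.0, 0.5])                  # gold similarity per sentence pair

diff = scores[:, None] - scores[None, :]                # [i, j] = s(i) - s(j)
keep = (labels[:, None] < labels[None, :]).float()      # penalize only where pair j should outrank pair i
diff = diff - (1 - keep) * 1e12                         # push the rest towards exp(-inf)
loss = torch.logsumexp(torch.cat((torch.zeros(1), diff.view(-1))), dim=0)
print(loss)  # close to 0 here, since the higher-labeled pairs already score higher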
from enum import Enum
from typing import Iterable, Dict
import torch.nn.functional as F
from torch import nn, Tensor
from sentence_transformers.SentenceTransformer import SentenceTransformer
class SiameseDistanceMetric(Enum):
"""
The metric for the contrastive loss
"""
EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2)
MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1)
COSINE_DISTANCE = lambda x, y: 1 - F.cosine_similarity(x, y)
class ContrastiveLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
distance_metric=SiameseDistanceMetric.COSINE_DISTANCE,
margin: float = 0.5,
size_average: bool = True,
):
"""
Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the
two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used
:param margin: Negative samples (label == 0) should have a distance of at least the margin value.
:param size_average: Average by the size of the mini-batch.
References:
* Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
* `Training Examples > Quora Duplicate Questions <../../examples/training/quora_duplicate_questions/README.html>`_
Requirements:
1. (anchor, positive/negative) pairs
Relations:
- :class:`OnlineContrastiveLoss` is similar, but uses hard positive and hard negative pairs.
It often yields better results.
Inputs:
+-----------------------------------------------+------------------------------+
| Texts | Labels |
+===============================================+==============================+
| (anchor, positive/negative) pairs | 1 if positive, 0 if negative |
+-----------------------------------------------+------------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = [
InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),
InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.ContrastiveLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(ContrastiveLoss, self).__init__()
self.distance_metric = distance_metric
self.margin = margin
self.model = model
self.size_average = size_average
def get_config_dict(self):
distance_metric_name = self.distance_metric.__name__
for name, value in vars(SiameseDistanceMetric).items():
if value == self.distance_metric:
distance_metric_name = "SiameseDistanceMetric.{}".format(name)
break
return {"distance_metric": distance_metric_name, "margin": self.margin, "size_average": self.size_average}
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
assert len(reps) == 2
rep_anchor, rep_other = reps
distances = self.distance_metric(rep_anchor, rep_other)
losses = 0.5 * (
labels.float() * distances.pow(2) + (1 - labels).float() * F.relu(self.margin - distances).pow(2)
)
return losses.mean() if self.size_average else losses.sum()
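# ---------------------------------------------------------------------------
# Hedged numeric sketch (not part of the file above) of the contrastive
# objective in forward() above: positive pairs are pulled together via the
# squared distance, negative pairs are pushed out to at least the margin.
# The distances and labels are made up.
import torch
import torch.nn.functional as F

margin = 0.5
distances = torch.tensor([0.1, 0.3, 0.9])  # toy embedding distances
labels = torch.tensor([1.0, 0.0, 0.0])     # 1 = positive pair, 0 = negative pair
losses = 0.5 * (labels * distances.pow(2) + (1 - labels) * F.relu(margin - distances).pow(2))
print(losses)  # approximately [0.005, 0.020, 0.000]; the far-apart negative costs nothing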
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from ..SentenceTransformer import SentenceTransformer
from .. import util
import copy
import random
import math
from .. import InputExample
import numpy as np
class ContrastiveTensionLoss(nn.Module):
"""
This loss expects only single sentences, without any labels. Positive and negative pairs are automatically created via random sampling,
such that a positive pair consists of two identical sentences and a negative pair consists of two different sentences. An independent
copy of the encoder model is created, which is used for encoding the first sentence of each pair. The original encoder model encodes the
second sentence. The embeddings are compared and scored using the generated labels (1 if positive, 0 if negative) using the binary cross
entropy objective.
Note that you must use the `ContrastiveTensionDataLoader` for this loss. The `pos_neg_ratio` of the ContrastiveTensionDataLoader can be
used to determine the number of negative pairs per positive pair.
Generally, :class:`ContrastiveTensionLossInBatchNegatives` is recommended over this loss, as it gives a stronger training signal.
:param model: SentenceTransformer model
References:
* Semantic Re-Tuning with Contrastive Tension: https://openreview.net/pdf?id=Ov_sMNau-PF
* `Unsupervised Learning > CT <../../examples/unsupervised_learning/CT/README.html>`_
Relations:
* :class:`ContrastiveTensionLossInBatchNegatives` uses in-batch negative sampling, which gives a stronger training signal than this loss.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | none |
+------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.losses import ContrastiveTensionDataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = [
'This is the 1st sentence',
'This is the 2nd sentence',
'This is the 3rd sentence',
'This is the 4th sentence',
'This is the 5th sentence',
'This is the 6th sentence',
'This is the 7th sentence',
'This is the 8th sentence',
'This is the 9th sentence',
'This is the final sentence',
]
train_dataloader = ContrastiveTensionDataLoader(train_examples, batch_size=3, pos_neg_ratio=3)
train_loss = losses.ContrastiveTensionLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
def __init__(self, model: SentenceTransformer):
super(ContrastiveTensionLoss, self).__init__()
self.model2 = model # This will be the final model used during the inference time.
self.model1 = copy.deepcopy(model)
self.criterion = nn.BCEWithLogitsLoss(reduction="sum")
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
sentence_features1, sentence_features2 = tuple(sentence_features)
reps_1 = self.model1(sentence_features1)["sentence_embedding"] # (bsz, hdim)
reps_2 = self.model2(sentence_features2)["sentence_embedding"]
sim_scores = (
torch.matmul(reps_1[:, None], reps_2[:, :, None]).squeeze(-1).squeeze(-1)
) # (bsz,) dot product, i.e. S1S2^T
loss = self.criterion(sim_scores, labels.type_as(sim_scores))
return loss
class ContrastiveTensionLossInBatchNegatives(nn.Module):
def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct=util.cos_sim):
"""
This loss expects only single sentences, without any labels. Positive and negative pairs are automatically created via random sampling,
such that a positive pair consists of two identical sentences and a negative pair consists of two different sentences. An independent
copy of the encoder model is created, which is used for encoding the first sentence of each pair. The original encoder model encodes the
second sentence. Unlike :class:`ContrastiveTensionLoss`, this loss uses the batch negative sampling strategy, i.e. the negative pairs
are sampled from the batch. Using in-batch negative sampling gives a stronger training signal than the original :class:`ContrastiveTensionLoss`.
The performance usually increases with increasing batch sizes.
Note that you should not use the `ContrastiveTensionDataLoader` for this loss, but just a normal DataLoader with `InputExample` instances.
The two texts of each `InputExample` instance should be identical.
:param model: SentenceTransformer model
:param scale: Output of similarity function is multiplied by scale value
:param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1)
References:
- Semantic Re-Tuning with Contrastive Tension: https://openreview.net/pdf?id=Ov_sMNau-PF
- `Unsupervised Learning > CT (In-Batch Negatives) <../../examples/unsupervised_learning/CT_In-Batch_Negatives/README.html>`_
Relations:
* :class:`ContrastiveTensionLoss` does not select negative pairs in-batch, resulting in a weaker training signal than this loss.
Inputs:
+------------------------+--------+
| Texts | Labels |
+========================+========+
| (anchor, anchor) pairs | none |
+------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = [
InputExample(texts=['This is the 1st sentence', 'This is the 1st sentence']),
InputExample(texts=['This is the 2nd sentence', 'This is the 2nd sentence']),
InputExample(texts=['This is the 3rd sentence', 'This is the 3rd sentence']),
InputExample(texts=['This is the 4th sentence', 'This is the 4th sentence']),
InputExample(texts=['This is the 5th sentence', 'This is the 5th sentence']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.ContrastiveTensionLossInBatchNegatives(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(ContrastiveTensionLossInBatchNegatives, self).__init__()
self.model2 = model # This will be the final model used during the inference time.
self.model1 = copy.deepcopy(model)
self.similarity_fct = similarity_fct
self.cross_entropy_loss = nn.CrossEntropyLoss()
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(scale))
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
sentence_features1, sentence_features2 = tuple(sentence_features)
embeddings_a = self.model1(sentence_features1)["sentence_embedding"] # (bsz, hdim)
embeddings_b = self.model2(sentence_features2)["sentence_embedding"]
scores = self.similarity_fct(embeddings_a, embeddings_b) * self.logit_scale.exp() # self.scale
labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device)
return (self.cross_entropy_loss(scores, labels) + self.cross_entropy_loss(scores.t(), labels)) / 2
################# CT Data Loader #################
# For CT, we need batches in a specific format
# In each batch, we have one positive pair (i.e. [sentA, sentA]) and 7 negative pairs (i.e. [sentA, sentB]).
# To achieve this, we create a custom DataLoader that produces batches with this property
class ContrastiveTensionDataLoader:
def __init__(self, sentences, batch_size, pos_neg_ratio=8):
self.sentences = sentences
self.batch_size = batch_size
self.pos_neg_ratio = pos_neg_ratio
self.collate_fn = None
if self.batch_size % self.pos_neg_ratio != 0:
raise ValueError(
f"ContrastiveTensionDataLoader was loaded with a pos_neg_ratio of {pos_neg_ratio} and a batch size of {batch_size}. The batch size must be divisible by the pos_neg_ratio"
)
def __iter__(self):
random.shuffle(self.sentences)
sentence_idx = 0
batch = []
while sentence_idx + 1 < len(self.sentences):
s1 = self.sentences[sentence_idx]
if len(batch) % self.pos_neg_ratio > 0: # Negative (different) pair
sentence_idx += 1
s2 = self.sentences[sentence_idx]
label = 0
else: # Positive (identical pair)
s2 = self.sentences[sentence_idx]
label = 1
sentence_idx += 1
batch.append(InputExample(texts=[s1, s2], label=label))
if len(batch) >= self.batch_size:
yield self.collate_fn(batch) if self.collate_fn is not None else batch
batch = []
def __len__(self):
return math.floor(len(self.sentences) / (2 * self.batch_size))
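# ---------------------------------------------------------------------------
# Hedged sketch (not part of the file above) of the batch layout produced by
# ContrastiveTensionDataLoader: with batch_size=4 and pos_neg_ratio=4, every
# batch holds one identical (label 1) pair followed by three different
# (label 0) pairs. The sentences below are made up.
from sentence_transformers.losses import ContrastiveTensionDataLoader

sentences = [f"sentence {i}" for i in range(16)]
loader = ContrastiveTensionDataLoader(sentences, batch_size=4, pos_neg_ratio=4)
for batch in loader:
    for example in batch:
        print(example.label, example.texts)
    break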
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from ..SentenceTransformer import SentenceTransformer
class CosineSimilarityLoss(nn.Module):
def __init__(self, model: SentenceTransformer, loss_fct=nn.MSELoss(), cos_score_transformation=nn.Identity()):
"""
CosineSimilarityLoss expects that the InputExamples consist of two texts and a float label. It computes the
vectors ``u = model(sentence_A)`` and ``v = model(sentence_B)`` and measures the cosine-similarity between the two.
By default, it minimizes the following loss: ``||input_label - cos_score_transformation(cosine_sim(u,v))||_2``.
:param model: SentenceTransformer model
:param loss_fct: Which pytorch loss function should be used to compare the ``cosine_similarity(u, v)`` with the input_label?
By default, MSE is used: ``||input_label - cosine_sim(u, v)||_2``
:param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity.
By default, the identity function is used (i.e. no change).
References:
- `Training Examples > Semantic Textual Similarity <../../examples/training/sts/README.html>`_
Requirements:
1. Sentence pairs with corresponding similarity scores in range `[0, 1]`
Relations:
- :class:`CoSENTLoss` seems to produce a stronger training signal than CosineSimilarityLoss. In our experiments, CoSENTLoss is recommended.
- :class:`AnglELoss` is :class:`CoSENTLoss` with ``pairwise_angle_sim`` as the metric, rather than ``pairwise_cos_sim``. It also produces a stronger training signal than CosineSimilarityLoss.
Inputs:
+--------------------------------+------------------------+
| Texts | Labels |
+================================+========================+
| (sentence_A, sentence_B) pairs | float similarity score |
+--------------------------------+------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [
InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)
]
train_batch_size = 1
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(CosineSimilarityLoss, self).__init__()
self.model = model
self.loss_fct = loss_fct
self.cos_score_transformation = cos_score_transformation
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))
return self.loss_fct(output, labels.view(-1))
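# Illustrative sketch (not part of the original file): the forward pass above
# boils down to an MSE between the gold similarity scores and the cosine
# similarity of the two sentence embeddings. A minimal, self-contained
# reproduction with hypothetical random embeddings in place of model outputs:
def _cosine_similarity_loss_sketch():
    import torch
    from torch import nn
    u = torch.randn(4, 384)                   # embeddings of the first sentences
    v = torch.randn(4, 384)                   # embeddings of the second sentences
    labels = torch.tensor([0.8, 0.3, 0.5, 1.0])
    output = torch.cosine_similarity(u, v)    # identity transformation assumed
    return nn.MSELoss()(output, labels.view(-1))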
from torch import nn, Tensor
from typing import Iterable, Dict
from sentence_transformers import SentenceTransformer
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, PreTrainedModel
import logging
logger = logging.getLogger(__name__)
class DenoisingAutoEncoderLoss(nn.Module):
def __init__(self, model: SentenceTransformer, decoder_name_or_path: str = None, tie_encoder_decoder: bool = True):
"""
This loss expects as input pairs of damaged sentences and the corresponding original ones.
During training, the decoder reconstructs the original sentences from the encoded sentence embeddings.
The argument 'decoder_name_or_path' indicates the pretrained model (supported by Hugging Face) to be used as the decoder.
Since a decoding process is included, the decoder should have a class called XXXLMHead (in the context of Hugging Face's Transformers).
The 'tie_encoder_decoder' flag indicates whether to tie the trainable parameters of encoder and decoder,
which has been shown to benefit model performance while limiting the amount of required memory.
The flag 'tie_encoder_decoder' only works when the encoder and decoder share the same architecture.
The data generation process (i.e. the 'damaging' process) is already implemented in ``DenoisingAutoEncoderDataset``,
so you only need to provide regular sentences.
:param model: SentenceTransformer model
:param decoder_name_or_path: Model name or path for initializing a decoder (compatible with Huggingface's Transformers)
:param tie_encoder_decoder: whether to tie the trainable parameters of encoder and decoder
References:
* TSDAE paper: https://arxiv.org/pdf/2104.06979.pdf
* `Unsupervised Learning > TSDAE <../../examples/unsupervised_learning/TSDAE/README.html>`_
Requirements:
1. The decoder should have a class called XXXLMHead (in the context of Hugging Face's Transformers)
2. Should use a large corpus
Inputs:
+------------------------------------------------------+--------+
| Texts | Labels |
+======================================================+========+
| (damaged\_sentence, original\_sentence) pairs | none |
+------------------------------------------------------+--------+
| sentence fed through ``DenoisingAutoEncoderDataset`` | none |
+------------------------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from torch.utils.data import DataLoader
model_name = "bert-base-cased"
model = SentenceTransformer(model_name)
train_sentences = [
"First training sentence", "Second training sentence", "Third training sentence", "Fourth training sentence",
]
batch_size = 2
train_dataset = DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
train_loss = losses.DenoisingAutoEncoderLoss(
model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)
model.fit(
train_objectives=[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(DenoisingAutoEncoderLoss, self).__init__()
self.encoder = model # This will be the final model used during inference.
self.tokenizer_encoder = model.tokenizer
encoder_name_or_path = model[0].auto_model.config._name_or_path
if decoder_name_or_path is None:
assert (
tie_encoder_decoder
), "Must indicate the decoder_name_or_path argument when tie_encoder_decoder=False!"
if tie_encoder_decoder:
if decoder_name_or_path:
logger.warning("When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.")
decoder_name_or_path = encoder_name_or_path
self.tokenizer_decoder = AutoTokenizer.from_pretrained(decoder_name_or_path)
self.need_retokenization = not isinstance(self.tokenizer_encoder, type(self.tokenizer_decoder))
decoder_config = AutoConfig.from_pretrained(decoder_name_or_path)
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
kwargs_decoder = {"config": decoder_config}
try:
self.decoder = AutoModelForCausalLM.from_pretrained(decoder_name_or_path, **kwargs_decoder)
except ValueError as e:
logger.error(
f'Model name or path "{decoder_name_or_path}" does not support being used as a decoder. Please make sure the decoder model has an "XXXLMHead" class.'
)
raise e
assert model[0].auto_model.config.hidden_size == decoder_config.hidden_size, "Hidden sizes do not match!"
if self.tokenizer_decoder.pad_token is None:
# Needed by GPT-2, etc.
self.tokenizer_decoder.pad_token = self.tokenizer_decoder.eos_token
self.decoder.config.pad_token_id = self.decoder.config.eos_token_id
if len(AutoTokenizer.from_pretrained(encoder_name_or_path)) != len(self.tokenizer_encoder):
logger.warning(
"WARNING: The vocabulary of the encoder has been changed. One might need to change the decoder vocabulary, too."
)
if tie_encoder_decoder:
assert not self.need_retokenization, "The tokenizers should be the same when tie_encoder_decoder=True."
if len(self.tokenizer_encoder) != len(self.tokenizer_decoder): # The vocabulary has been changed.
self.tokenizer_decoder = self.tokenizer_encoder
self.decoder.resize_token_embeddings(len(self.tokenizer_decoder))
logger.warning(
"Since the encoder vocabulary has been changed and --tie_encoder_decoder=True, now the new vocabulary has also been used for the decoder."
)
decoder_base_model_prefix = self.decoder.base_model_prefix
PreTrainedModel._tie_encoder_decoder_weights(
model[0].auto_model, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
def retokenize(self, sentence_features):
input_ids = sentence_features["input_ids"]
device = input_ids.device
sentences_decoded = self.tokenizer_encoder.batch_decode(
input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
retokenized = self.tokenizer_decoder(
sentences_decoded, padding=True, truncation="longest_first", return_tensors="pt", max_length=None
).to(device)
return retokenized
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
source_features, target_features = tuple(sentence_features)
if self.need_retokenization:
# since the sentence_features here are all tokenized by the encoder's tokenizer,
# retokenization with the decoder's tokenizer is needed if the two tokenizers differ
target_features = self.retokenize(target_features)
reps = self.encoder(source_features)["sentence_embedding"] # (bsz, hdim)
# Prepare input and output
target_length = target_features["input_ids"].shape[1]
decoder_input_ids = target_features["input_ids"].clone()[:, : target_length - 1]
label_ids = target_features["input_ids"][:, 1:]
# Decode
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
inputs_embeds=None,
attention_mask=None,
encoder_hidden_states=reps[:, None], # (bsz, hdim) -> (bsz, 1, hdim)
encoder_attention_mask=source_features["attention_mask"][:, 0:1],
labels=None,
return_dict=None,
use_cache=False,
)
# Calculate loss
lm_logits = decoder_outputs[0]
ce_loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer_decoder.pad_token_id)
loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), label_ids.reshape(-1))
return loss
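# Illustrative sketch (not part of the original file): the decoding step above
# is standard teacher forcing. The decoder reads tokens [0 .. n-2], is trained
# to predict tokens [1 .. n-1], and attends to the sentence embedding as a
# single cross-attention "memory" vector. A hypothetical helper showing only
# the input/label shift:
def _teacher_forcing_shift(input_ids):
    # `input_ids` is assumed to be a (batch, seq_len) LongTensor produced by
    # the decoder tokenizer, like `target_features["input_ids"]` above.
    decoder_input_ids = input_ids[:, :-1]   # what the decoder sees
    label_ids = input_ids[:, 1:]            # what it is trained to predict
    return decoder_input_ids, label_ids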
from typing import Any, Iterable, Dict
import torch
from torch import nn, Tensor
from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers.models import Transformer
class GISTEmbedLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
guide: SentenceTransformer,
temperature: float = 0.01,
):
"""
This loss is used to train a SentenceTransformer model using the GISTEmbed algorithm.
It takes a model and a guide model as input, and uses the guide model to guide the
in-batch negative sample selection. The cosine similarity is used to compute the loss
and the temperature parameter is used to scale the cosine similarities.
:param model: SentenceTransformer model based on a `transformers` model.
:param guide: SentenceTransformer model to guide the in-batch negative sample selection.
:param temperature: Temperature parameter to scale the cosine similarities.
References:
- For further details, see: https://arxiv.org/abs/2402.16829
Requirements:
1. (anchor, positive, negative) triplets
2. (anchor, positive) pairs
Relations:
- :class:`MultipleNegativesRankingLoss` is similar to this loss, but it does not use
a guide model to guide the in-batch negative sample selection. `GISTEmbedLoss` yields
a stronger training signal at the cost of some training overhead.
Inputs:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| (anchor, positive, negative) triplets | none |
+---------------------------------------+--------+
| (anchor, positive) pairs | none |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
guide = SentenceTransformer('avsolatorio/GIST-small-Embedding-v0')
train_examples = [
InputExample(texts=['The first query', 'The first positive passage', 'The first negative passage']),
InputExample(texts=['The second query', 'The second positive passage', 'The second negative passage']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.GISTEmbedLoss(model=model, guide=guide)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(GISTEmbedLoss, self).__init__()
self.model = model
self.guide = guide
self.temperature = temperature
self.similarity_fct = nn.CosineSimilarity(dim=-1)
if not isinstance(model[0], Transformer) or not isinstance(guide[0], Transformer):
raise ValueError(
"Both the training model and the guiding model must be based on the `transformers` architecture."
)
self.must_retokenize = (
model.tokenizer.vocab != guide.tokenizer.vocab or guide.max_seq_length < model.max_seq_length
)
def sim_matrix(self, embed1, embed2):
return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0))
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
with torch.no_grad():
if self.must_retokenize:
decoded = [
self.model.tokenizer.batch_decode(sentence_feature["input_ids"], skip_special_tokens=True)
for sentence_feature in sentence_features
]
sentence_features = [self.guide.tokenize(sentences) for sentences in decoded]
sentence_features = [
{key: value.to(self.guide.device) for key, value in sentence_feature.items()}
for sentence_feature in sentence_features
]
guide_embeddings = [
self.guide(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features
]
negative = None
negative_guide = None
if len(embeddings) == 2:
anchor, positive = embeddings
anchor_guide, positive_guide = guide_embeddings
elif len(embeddings) == 3:
anchor, positive, negative = embeddings
anchor_guide, positive_guide, negative_guide = guide_embeddings
else:
raise ValueError("Expected 2 or 3 embeddings, got {}".format(len(embeddings)))
# Compute the model's similarities
ap_sim = self.sim_matrix(anchor, positive)
aa_sim = self.sim_matrix(anchor, anchor)
pp_sim = self.sim_matrix(positive, positive)
# Let's compute the similarity matrices for the combinations of anchor and positive samples.
guided_ap_sim = self.sim_matrix(anchor_guide, positive_guide)
guided_aa_sim = self.sim_matrix(anchor_guide, anchor_guide)
guided_pp_sim = self.sim_matrix(positive_guide, positive_guide)
# Define the anchor threshold
guided_sim = guided_ap_sim.diagonal().view(-1, 1)
# Find which samples cannot be used as negatives because they are
# more similar to the query than the assigned positive as deemed by the guide model.
# For these samples, we mask them with -inf to basically ignore their contribution to
# the loss.
ap_sim[guided_ap_sim > guided_sim] = -torch.inf
aa_sim[guided_aa_sim > guided_sim] = -torch.inf
pp_sim[guided_pp_sim > guided_sim] = -torch.inf
scores = [ap_sim, aa_sim, pp_sim]
# Handle the case where we have a negative sample
if negative is not None:
an_sim = self.sim_matrix(anchor, negative)
guided_an_sim = self.sim_matrix(anchor_guide, negative_guide)
an_sim[guided_an_sim > guided_sim] = -torch.inf
scores.append(an_sim)
scores = torch.cat(scores, dim=1) / self.temperature
# NOTE: We use arange here since the ap_sim matrix contains the anchor-positive
# similarities along the diagonal.
labels = torch.arange(scores.size(0)).long().to(scores.device)
return nn.CrossEntropyLoss()(scores, labels)
def get_config_dict(self) -> Dict[str, Any]:
return {
"guide": self.guide,
"temperature": self.temperature,
}
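# Illustrative sketch (not part of the original file): a minimal reproduction
# of the guided masking above for the pairs-only case, using random tensors in
# place of real model and guide embeddings. The full loss additionally
# concatenates the anchor-anchor and positive-positive similarity matrices.
def _gist_guided_masking_sketch():
    import torch
    from torch import nn
    anchor, positive = torch.randn(8, 64), torch.randn(8, 64)
    anchor_g, positive_g = torch.randn(8, 64), torch.randn(8, 64)
    cos = nn.CosineSimilarity(dim=-1)

    def sim(a, b):
        return cos(a.unsqueeze(1), b.unsqueeze(0))

    ap_sim, guided_ap_sim = sim(anchor, positive), sim(anchor_g, positive_g)
    guided_sim = guided_ap_sim.diagonal().view(-1, 1)
    # Ignore in-batch candidates that the guide considers more similar to the
    # anchor than its assigned positive (likely false negatives).
    ap_sim[guided_ap_sim > guided_sim] = -torch.inf
    scores = ap_sim / 0.01                       # temperature = 0.01
    labels = torch.arange(scores.size(0))
    return nn.CrossEntropyLoss()(scores, labels)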
from torch import nn, Tensor
from typing import Iterable, Dict
class MSELoss(nn.Module):
def __init__(self, model):
"""
Computes the MSE loss between the computed sentence embedding and a target sentence embedding. This loss
is used when extending sentence embeddings to new languages as described in our publication
Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation.
For an example, see `the distillation documentation <../../examples/training/distillation/README.html>`_ on extending language models to new languages.
:param model: SentenceTransformerModel
References:
- Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation: https://arxiv.org/abs/2004.09813
- `Training > Model Distillation <../../examples/training/distillation/README.html>`_
- `Training > Multilingual Models <../../examples/training/multilingual/README.html>`_
Requirements:
1. Usually uses a finetuned teacher M in a knowledge distillation setup
Relations:
- :class:`MarginMSELoss` is equivalent to this loss, but with a margin through a negative pair.
Input:
+-------------------+-----------------------------+
| Texts | Labels |
+===================+=============================+
| single sentences | model sentence embeddings |
+-------------------+-----------------------------+
Example::
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
model_en = SentenceTransformer('bert-base-cased')
model_fr = SentenceTransformer('flaubert/flaubert_base_cased')
examples_en = ['The first sentence', 'The second sentence', 'The third sentence', 'The fourth sentence']
examples_fr = ['La première phrase', 'La deuxième phrase', 'La troisième phrase', 'La quatrième phrase']
train_batch_size = 2
labels_en_en = model_en.encode(examples_en)
examples_en_fr = [InputExample(texts=[x], label=labels_en_en[i]) for i, x in enumerate(examples_en)]
loader_en_fr = DataLoader(examples_en_fr, batch_size=train_batch_size)
examples_fr_fr = [InputExample(texts=[x], label=labels_en_en[i]) for i, x in enumerate(examples_fr)]
loader_fr_fr = DataLoader(examples_fr_fr, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model_fr)
model_fr.fit(
[(loader_en_fr, train_loss), (loader_fr_fr, train_loss)],
epochs=10,
)
"""
super(MSELoss, self).__init__()
self.model = model
self.loss_fct = nn.MSELoss()
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
rep = self.model(sentence_features[0])["sentence_embedding"]
return self.loss_fct(rep, labels)
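# Illustrative sketch (not part of the original file): in the distillation
# setup described above the labels are teacher embeddings, so the loss is a
# plain MSE between the student output and the teacher vector. Hypothetical
# tensors stand in for the two models:
def _mse_distillation_sketch():
    import torch
    from torch import nn
    student_emb = torch.randn(4, 768)   # student model sentence embeddings
    teacher_emb = torch.randn(4, 768)   # precomputed teacher embeddings (the labels)
    return nn.MSELoss()(student_emb, teacher_emb)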
from .. import util
from torch import nn, Tensor
from typing import Iterable, Dict
class MarginMSELoss(nn.Module):
def __init__(self, model, similarity_fct=util.pairwise_dot_score):
"""
Compute the MSE loss between the ``|sim(Query, Pos) - sim(Query, Neg)|`` and ``|gold_sim(Query, Pos) - gold_sim(Query, Neg)|``.
By default, sim() is the dot-product. The gold_sim is often the similarity score from a teacher model.
In contrast to :class:`MultipleNegativesRankingLoss`, the two passages do not have to be strictly positive and negative,
both can be relevant or not relevant for a given query. This can be an advantage of MarginMSELoss over
MultipleNegativesRankingLoss, but note that the MarginMSELoss is much slower to train. With MultipleNegativesRankingLoss,
with a batch size of 64, we compare one query against 128 passages. With MarginMSELoss, we compare a query only
against two passages.
:param model: SentenceTransformerModel
:param similarity_fct: Which similarity function to use.
References:
- For more details, please refer to https://arxiv.org/abs/2010.02666.
- `Training Examples > MS MARCO <../../examples/training/ms_marco/README.html>`_
- `Unsupervised Learning > Domain Adaptation <../../examples/domain_adaptation/README.html>`_
Requirements:
1. (query, passage_one, passage_two) triplets
2. Usually used with a finetuned teacher M in a knowledge distillation setup
Relations:
- :class:`MSELoss` is equivalent to this loss, but without a margin through the negative pair.
Inputs:
+-----------------------------------------------+-----------------------------------------------+
| Texts | Labels |
+===============================================+===============================================+
| (query, passage_one, passage_two) triplets | M(query, passage_one) - M(query, passage_two) |
+-----------------------------------------------+-----------------------------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.util import pairwise_dot_score
from torch.utils.data import DataLoader
import torch
student_model = SentenceTransformer('sentence-transformers/distilbert-base-nli-mean-tokens')
teacher_model = SentenceTransformer('sentence-transformers/bert-base-nli-stsb-mean-tokens')
train_examples = [
['The first query', 'The first positive passage', 'The first negative passage'],
['The second query', 'The second positive passage', 'The second negative passage'],
['The third query', 'The third positive passage', 'The third negative passage'],
]
train_batch_size = 1
encoded = torch.tensor([teacher_model.encode(x).tolist() for x in train_examples])
labels = pairwise_dot_score(encoded[:, 0], encoded[:, 1]) - pairwise_dot_score(encoded[:, 0], encoded[:, 2])
train_input_examples = [InputExample(texts=x, label=labels[i]) for i, x in enumerate(train_examples)]
train_dataloader = DataLoader(train_input_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MarginMSELoss(model=student_model)
student_model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(MarginMSELoss, self).__init__()
self.model = model
self.similarity_fct = similarity_fct
self.loss_fct = nn.MSELoss()
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
# sentence_features: query, positive passage, negative passage
reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
embeddings_query = reps[0]
embeddings_pos = reps[1]
embeddings_neg = reps[2]
scores_pos = self.similarity_fct(embeddings_query, embeddings_pos)
scores_neg = self.similarity_fct(embeddings_query, embeddings_neg)
margin_pred = scores_pos - scores_neg
return self.loss_fct(margin_pred, labels)
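# Illustrative sketch (not part of the original file): the label is the
# difference of teacher scores, and the prediction is the same difference
# computed from student embeddings. Hypothetical tensors stand in for the
# student model outputs and the teacher margin:
def _margin_mse_sketch():
    import torch
    from torch import nn
    from sentence_transformers.util import pairwise_dot_score
    query, pos, neg = torch.randn(4, 768), torch.randn(4, 768), torch.randn(4, 768)
    teacher_margin = torch.randn(4)     # gold_sim(query, pos) - gold_sim(query, neg)
    student_margin = pairwise_dot_score(query, pos) - pairwise_dot_score(query, neg)
    return nn.MSELoss()(student_margin, teacher_margin)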
from typing import Any, Dict, List, Optional, Union
from torch.nn import Module
from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers.losses import AdaptiveLayerLoss, MatryoshkaLoss
class Matryoshka2dLoss(AdaptiveLayerLoss):
def __init__(
self,
model: SentenceTransformer,
loss: Module,
matryoshka_dims: List[int],
matryoshka_weights: Optional[List[Union[float, int]]] = None,
n_layers_per_step: int = 1,
n_dims_per_step: int = 1,
last_layer_weight: float = 1.0,
prior_layers_weight: float = 1.0,
kl_div_weight: float = 1.0,
kl_temperature: float = 0.3,
) -> None:
"""
The Matryoshka2dLoss can be seen as a loss *modifier* that combines the :class:`AdaptiveLayerLoss` and the
:class:`MatryoshkaLoss`. This allows you to train an embedding model that 1) allows users to specify the number
of model layers to use, and 2) allows users to specify the output dimensions to use.
The former is useful for when you want users to have the option to lower the number of layers used to improve
their inference speed and memory usage, and the latter is useful for when you want users to have the option to
lower the output dimensions to improve the efficiency of their downstream tasks (e.g. retrieval) or to lower
their storage costs.
Note that this uses `n_layers_per_step=1` and `n_dims_per_step=1` as defaults, following the original 2DMSE
implementation.
:param model: SentenceTransformer model
:param loss: The loss function to be used, e.g. :class:`MultipleNegativesRankingLoss`, :class:`CoSENTLoss`, etc.
:param matryoshka_dims: A list of embedding dimensions to be used for the loss function, e.g. [768, 512, 256, 128, 64].
:param matryoshka_weights: A list of weights to be used for the loss function, e.g. [1, 1, 1, 1, 1]. If None, then the
weights will be set to 1 for all dimensions.
:param n_layers_per_step: The number of layers to use per step. If -1, then all layers are used. If > 0, then
a random sample of n_layers_per_step layers are used per step. The 2DMSE paper uses `n_layers_per_step=1`.
The default value is 1.
:param n_dims_per_step: The number of dimensions to use per step. If -1, then all dimensions are used. If > 0, then
a random sample of n_dims_per_step dimensions are used per step. The default value is 1.
:param last_layer_weight: The weight to use for the loss of the final layer. Increase this to focus more on the
performance when using all layers. The default value is 1.0.
:param prior_layers_weight: The weight to use for the loss of the prior layers. Increase this to focus more on
the performance when using fewer layers. The default value is 1.0.
:param kl_div_weight: The weight to use for the KL-divergence loss that is used to make the prior layers match
that of the last layer. Increase this to focus more on the performance when using fewer layers. The default
value is 1.0.
:param kl_temperature: The temperature to use for the KL-divergence loss. If 0, then the KL-divergence loss is
not used. The default value is 0.3.
References:
- See the 2D Matryoshka Sentence Embeddings (2DMSE) paper: https://arxiv.org/abs/2402.14776
- `Matryoshka Embeddings <../../examples/training/matryoshka/README.html>`_
- `Adaptive Layers <../../examples/training/adaptive_layer/README.html>`_
Requirements:
1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss`.
Relations:
- :class:`MatryoshkaLoss` is used in this loss, and it is responsible for the dimensionality reduction.
- :class:`AdaptiveLayerLoss` is used in this loss, and it is responsible for the layer reduction.
Input:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| any | any |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('microsoft/mpnet-base')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
matryoshka_loss = MatryoshkaLoss(
model,
loss,
matryoshka_dims,
matryoshka_weights=matryoshka_weights,
n_dims_per_step=n_dims_per_step,
)
super().__init__(
model,
matryoshka_loss,
n_layers_per_step=n_layers_per_step,
last_layer_weight=last_layer_weight,
prior_layers_weight=prior_layers_weight,
kl_div_weight=kl_div_weight,
kl_temperature=kl_temperature,
)
def get_config_dict(self) -> Dict[str, Any]:
return {
**super().get_config_dict(),
**self.loss.get_config_dict(),
}
import random
from typing import Any, Dict, Iterable, List, Optional, Union
import warnings
from torch import Tensor, nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import CachedMultipleNegativesRankingLoss
class ForwardDecorator:
def __init__(self, fn):
self.fn = fn
self.dim = None
self.cache = []
self.cache_dim = None
self.idx = 0
def set_dim(self, dim):
self.dim = dim
self.idx = 0
def shrink(self, tensor: Tensor) -> Tensor:
tensor = tensor[..., : self.dim]
tensor = F.normalize(tensor, p=2, dim=-1)
return tensor
def __call__(self, features):
# Growing cache:
if self.cache_dim is None or self.cache_dim == self.dim:
output = self.fn(features)
self.cache.append(output)
self.cache_dim = self.dim
# Using cache:
else:
output = self.cache[self.idx]
output["token_embeddings"] = self.shrink(output["token_embeddings"])
output["sentence_embedding"] = self.shrink(output["sentence_embedding"])
self.idx += 1
return output
class MatryoshkaLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
loss: nn.Module,
matryoshka_dims: List[int],
matryoshka_weights: Optional[List[Union[float, int]]] = None,
n_dims_per_step: int = -1,
) -> None:
"""
The MatryoshkaLoss can be seen as a loss *modifier* that allows you to use other loss functions at various
different embedding dimensions. This is useful for when you want to train a model where users have the option
to lower the embedding dimension to improve their embedding comparison speed and costs.
:param model: SentenceTransformer model
:param loss: The loss function to be used, e.g. :class:`MultipleNegativesRankingLoss`, :class:`CoSENTLoss`, etc.
:param matryoshka_dims: A list of embedding dimensions to be used for the loss function, e.g. [768, 512, 256, 128, 64].
:param matryoshka_weights: A list of weights to be used for the loss function, e.g. [1, 1, 1, 1, 1]. If None, then the
weights will be set to 1 for all dimensions.
:param n_dims_per_step: The number of dimensions to use per step. If -1, then all dimensions are used. If > 0, then
a random sample of n_dims_per_step dimensions are used per step. The default value is -1.
References:
- The concept was introduced in this paper: https://arxiv.org/abs/2205.13147
- `Matryoshka Embeddings <../../examples/training/matryoshka/README.html>`_
Requirements:
1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss`.
Relations:
- :class:`Matryoshka2dLoss` uses this loss in combination with :class:`AdaptiveLayerLoss` which allows for
layer reduction for faster inference.
Input:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| any | any |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('microsoft/mpnet-base')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super().__init__()
self.model = model
self.loss = loss
if isinstance(loss, CachedMultipleNegativesRankingLoss):
warnings.warn("MatryoshkaLoss is not compatible with CachedMultipleNegativesRankingLoss.", stacklevel=2)
self.matryoshka_dims = matryoshka_dims
if matryoshka_weights is None:
matryoshka_weights = [1] * len(matryoshka_dims)
self.matryoshka_weights = matryoshka_weights
self.n_dims_per_step = n_dims_per_step
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor:
original_forward = self.model.forward
decorated_forward = ForwardDecorator(original_forward)
self.model.forward = decorated_forward
dim_indices = range(len(self.matryoshka_dims))
if self.n_dims_per_step > 0 and self.n_dims_per_step < len(dim_indices):
dim_indices = random.sample(dim_indices, self.n_dims_per_step)
loss = 0.0
for idx in dim_indices:
dim = self.matryoshka_dims[idx]
weight = self.matryoshka_weights[idx]
decorated_forward.set_dim(dim)
loss += weight * self.loss(sentence_features, labels)
self.model.forward = original_forward
return loss
def get_config_dict(self) -> Dict[str, Any]:
return {
"loss": self.loss.__class__.__name__,
"matryoshka_dims": self.matryoshka_dims,
"matryoshka_weights": self.matryoshka_weights,
"n_dims_per_step": self.n_dims_per_step,
}
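# Illustrative sketch (not part of the original file): the ForwardDecorator
# above implements Matryoshka truncation as "keep only the first `dim`
# dimensions, then re-normalize". The same operation on a hypothetical
# embedding tensor:
def _matryoshka_truncate(embeddings, dim=256):
    import torch.nn.functional as F
    # `embeddings` is assumed to be a (batch, full_dim) tensor with full_dim >= dim.
    return F.normalize(embeddings[..., :dim], p=2, dim=-1)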
from .. import util
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
import torch.nn.functional as F
class MegaBatchMarginLoss(nn.Module):
def __init__(
self,
model,
positive_margin: float = 0.8,
negative_margin: float = 0.3,
use_mini_batched_version: bool = True,
mini_batch_size: int = 50,
):
"""
Given a large batch (like 500 or more examples) of (anchor_i, positive_i) pairs, find for each pair in the batch
the hardest negative, i.e. find j != i such that cos_sim(anchor_i, positive_j) is maximal. Then create from this a
triplet (anchor_i, positive_i, positive_j) where positive_j serves as the negative for this triplet.
Then train as with the triplet loss.
:param model: SentenceTransformerModel
:param positive_margin: Positive margin, cos(anchor, positive) should be > positive_margin
:param negative_margin: Negative margin, cos(anchor, negative) should be < negative_margin
:param use_mini_batched_version: As large batch sizes require a lot of memory, we can use a mini-batched version.
We break down the large batch into smaller batches with fewer examples.
:param mini_batch_size: Size for the mini-batches. Should be a divisor of the batch size in your data loader.
References:
- This loss function was inspired by the ParaNMT paper: https://www.aclweb.org/anthology/P18-1042/
Requirements:
1. (anchor, positive) pairs
2. Large batches (500 or more examples)
Input:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| (anchor, positive) pairs | none |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
total_examples = 500
train_batch_size = 250
train_mini_batch_size = 32
train_examples = [
InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples)
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(MegaBatchMarginLoss, self).__init__()
self.model = model
self.positive_margin = positive_margin
self.negative_margin = negative_margin
self.mini_batch_size = mini_batch_size
self.forward = self.forward_mini_batched if use_mini_batched_version else self.forward_non_mini_batched
def forward_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
anchor, positive = sentence_features
feature_names = list(anchor.keys())
with torch.no_grad():
self.model.eval()
all_positive_emb = self.model(positive)["sentence_embedding"].detach()
self.model.train()
diagonal_matrix = torch.eye(len(all_positive_emb), len(all_positive_emb), device=all_positive_emb.device)
# Iterate over the triplets (anchor, positive, hardest_negative) in smaller mini_batch sizes
for start_idx in range(0, len(all_positive_emb), self.mini_batch_size):
end_idx = start_idx + self.mini_batch_size
anchor_emb = self.model({key: anchor[key][start_idx:end_idx] for key in feature_names})[
"sentence_embedding"
]
# Find hard negatives. For each anchor, find the hardest negative
# Store them in the triplets (anchor, positive, hardest_negative)
hard_negative_features = {key: [] for key in feature_names}
with torch.no_grad():
cos_scores = util.pytorch_cos_sim(anchor_emb, all_positive_emb)
negative_scores = (
cos_scores - 2 * diagonal_matrix[start_idx:end_idx]
) # Remove positive scores along the diagonal by pushing them below -1 so that they are never selected by the max() operation
negatives_max, negatives_ids = torch.max(negative_scores, dim=1)
for hard_negative_id in negatives_ids:
for key in feature_names:
hard_negative_features[key].append(positive[key][hard_negative_id])
for key in feature_names:
hard_negative_features[key] = torch.stack(hard_negative_features[key])
# Compute differentiable negative and positive embeddings
positive_emb = self.model({key: positive[key][start_idx:end_idx] for key in feature_names})[
"sentence_embedding"
]
negative_emb = self.model(hard_negative_features)["sentence_embedding"]
assert anchor_emb.shape == positive_emb.shape
assert anchor_emb.shape == negative_emb.shape
# Compute loss
pos_cosine = F.cosine_similarity(anchor_emb, positive_emb)
neg_cosine = F.cosine_similarity(anchor_emb, negative_emb)
losses = F.relu(self.positive_margin - pos_cosine) + F.relu(neg_cosine - self.negative_margin)
losses = losses.mean()
# Backpropagate unless it is the last mini-batch. The last mini-batch will be backpropagated by the outer training loop
if end_idx < len(cos_scores):
losses.backward()
return losses
##### Non mini-batched version ###
def forward_non_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
embeddings_a, embeddings_b = reps
cos_scores = util.pytorch_cos_sim(embeddings_a, embeddings_b)
positive_scores = torch.diagonal(cos_scores)
negative_scores = cos_scores - (
2 * torch.eye(*cos_scores.shape, device=cos_scores.device)
) # Remove positive scores along the diagonal
negatives_max, _ = torch.max(negative_scores, dim=1)
losses = F.relu(self.positive_margin - positive_scores) + F.relu(negatives_max - self.negative_margin)
return losses.mean()
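# Illustrative sketch (not part of the original file): the hardest-negative
# mining used above, on hypothetical random embeddings. Subtracting 2 * I
# pushes the diagonal (the true positives) below -1 so that max() can never
# select them.
def _hardest_negative_sketch():
    import torch
    from sentence_transformers import util
    anchors, positives = torch.randn(6, 128), torch.randn(6, 128)
    cos_scores = util.pytorch_cos_sim(anchors, positives)
    negative_scores = cos_scores - 2 * torch.eye(*cos_scores.shape)
    hardest_scores, hardest_ids = torch.max(negative_scores, dim=1)
    return hardest_ids   # index j of positive_j used as the negative for anchor_i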
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from ..SentenceTransformer import SentenceTransformer
from .. import util
class MultipleNegativesRankingLoss(nn.Module):
def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct=util.cos_sim):
"""
This loss expects as input a batch consisting of sentence pairs ``(a_1, p_1), (a_2, p_2)..., (a_n, p_n)``
where we assume that ``(a_i, p_i)`` are a positive pair and ``(a_i, p_j)`` for ``i != j`` a negative pair.
For each ``a_i``, it uses all other ``p_j`` as negative samples, i.e., for ``a_i``, we have 1 positive example
(``p_i``) and ``n-1`` negative examples (``p_j``). It then minimizes the negative log-likelihood for softmax-
normalized scores.
This loss function works well for training embeddings in retrieval setups where you have positive pairs
(e.g. (query, relevant_doc)), as it will sample ``n-1`` negative docs randomly from each batch.
The performance usually increases with increasing batch sizes.
You can also provide one or multiple hard negatives per anchor-positive pair by structuring the data like this:
``(a_1, p_1, n_1), (a_2, p_2, n_2)``. Then, ``n_1`` is a hard negative for ``(a_1, p_1)``. The loss will use for
the pair ``(a_i, p_i)`` all ``p_j`` for ``j != i`` and all ``n_j`` as negatives.
:param model: SentenceTransformer model
:param scale: Output of similarity function is multiplied by scale value
:param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1)
References:
- Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4: https://arxiv.org/pdf/1705.00652.pdf
- `Training Examples > Natural Language Inference <../../examples/training/nli/README.html>`_
- `Training Examples > Paraphrase Data <../../examples/training/paraphrases/README.html>`_
- `Training Examples > Quora Duplicate Questions <../../examples/training/quora_duplicate_questions/README.html>`_
- `Training Examples > MS MARCO <../../examples/training/ms_marco/README.html>`_
- `Unsupervised Learning > SimCSE <../../examples/unsupervised_learning/SimCSE/README.html>`_
- `Unsupervised Learning > GenQ <../../examples/unsupervised_learning/query_generation/README.html>`_
Requirements:
1. (anchor, positive) pairs or (anchor, positive, negative) triplets
Relations:
- :class:`CachedMultipleNegativesRankingLoss` is equivalent to this loss, but it uses caching that allows for
much higher batch sizes (and thus better performance) without extra memory usage. However, it requires more
training time.
- :class:`MultipleNegativesSymmetricRankingLoss` is equivalent to this loss, but with an additional loss term.
- :class:`GISTEmbedLoss` is equivalent to this loss, but uses a guide model to guide the in-batch negative
sample selection. `GISTEmbedLoss` yields a stronger training signal at the cost of some training overhead.
Inputs:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| (anchor, positive) pairs | none |
+---------------------------------------+--------+
| (anchor, positive, negative) triplets | none |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-uncased')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(MultipleNegativesRankingLoss, self).__init__()
self.model = model
self.scale = scale
self.similarity_fct = similarity_fct
self.cross_entropy_loss = nn.CrossEntropyLoss()
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
embeddings_a = reps[0]
embeddings_b = torch.cat(reps[1:])
scores = self.similarity_fct(embeddings_a, embeddings_b) * self.scale
labels = torch.tensor(
range(len(scores)), dtype=torch.long, device=scores.device
) # Example a[i] should match with b[i]
return self.cross_entropy_loss(scores, labels)
def get_config_dict(self):
return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__}
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from ..SentenceTransformer import SentenceTransformer
from .. import util
class MultipleNegativesSymmetricRankingLoss(nn.Module):
def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct=util.cos_sim):
"""
This loss is an adaptation of MultipleNegativesRankingLoss. MultipleNegativesRankingLoss computes the following loss:
For a given anchor and a list of candidates, find the positive candidate.
In MultipleNegativesSymmetricRankingLoss, we add another loss term: Given the positive and a list of all anchors,
find the correct (matching) anchor.
For the example of question-answering: You have (question, answer)-pairs. MultipleNegativesRankingLoss just computes
the loss to find the answer for a given question. MultipleNegativesSymmetricRankingLoss additionally computes the
loss to find the question for a given answer.
Note: If you pass triplets, the negative entry will be ignored. An anchor is searched for using only the positive.
:param model: SentenceTransformer model
:param scale: Output of similarity function is multiplied by scale value
:param similarity_fct: similarity function between sentence embeddings. By default, cos_sim. Can also be set to dot product (and then set scale to 1)
Requirements:
1. (anchor, positive) pairs
Relations:
- Like :class:`MultipleNegativesRankingLoss`, but with an additional loss term.
Inputs:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| (anchor, positive) pairs | none |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('distilbert-base-uncased')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesSymmetricRankingLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(MultipleNegativesSymmetricRankingLoss, self).__init__()
self.model = model
self.scale = scale
self.similarity_fct = similarity_fct
self.cross_entropy_loss = nn.CrossEntropyLoss()
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
anchor = reps[0]
candidates = torch.cat(reps[1:])
scores = self.similarity_fct(anchor, candidates) * self.scale
labels = torch.tensor(
range(len(scores)), dtype=torch.long, device=scores.device
) # Example a[i] should match with b[i]
anchor_positive_scores = scores[:, 0 : len(reps[1])]
forward_loss = self.cross_entropy_loss(scores, labels)
backward_loss = self.cross_entropy_loss(anchor_positive_scores.transpose(0, 1), labels)
return (forward_loss + backward_loss) / 2
def get_config_dict(self):
return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__}
from typing import Iterable, Dict
import torch.nn.functional as F
from torch import nn, Tensor
from .ContrastiveLoss import SiameseDistanceMetric
from sentence_transformers.SentenceTransformer import SentenceTransformer
class OnlineContrastiveLoss(nn.Module):
def __init__(
self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5
):
"""
This Online Contrastive loss is similar to :class:`ContrastiveLoss`, but it selects hard positive pairs (positives that
are far apart) and hard negative pairs (negatives that are close) and computes the loss only for these pairs.
This loss often yields better performance than ContrastiveLoss.
:param model: SentenceTransformer model
:param distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used
:param margin: Negative samples (label == 0) should have a distance of at least the margin value.
References:
- `Training Examples > Quora Duplicate Questions <../../examples/training/quora_duplicate_questions/README.html>`_
Requirements:
1. (anchor, positive/negative) pairs
2. Data should include hard positives and hard negatives
Relations:
- :class:`ContrastiveLoss` is similar, but does not use hard positive and hard negative pairs.
:class:`OnlineContrastiveLoss` often yields better results.
Inputs:
+-----------------------------------------------+------------------------------+
| Texts | Labels |
+===============================================+==============================+
| (anchor, positive/negative) pairs | 1 if positive, 0 if negative |
+-----------------------------------------------+------------------------------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('all-MiniLM-L6-v2')
train_examples = [
InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1),
InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.OnlineContrastiveLoss(model=model)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super(OnlineContrastiveLoss, self).__init__()
self.model = model
self.margin = margin
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False):
embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
distance_matrix = self.distance_metric(embeddings[0], embeddings[1])
negs = distance_matrix[labels == 0]
poss = distance_matrix[labels == 1]
# select hard positive and hard negative pairs
negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())]
positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())]
positive_loss = positive_pairs.pow(2).sum()
negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum()
loss = positive_loss + negative_loss
return loss
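# Illustrative sketch (not part of the original file): the hard-pair selection
# above keeps only negatives that are closer than the farthest positive and
# positives that are farther than the closest negative. Hypothetical distances:
def _online_contrastive_sketch():
    import torch
    import torch.nn.functional as F
    margin = 0.5
    distances = torch.tensor([0.1, 0.9, 0.6, 0.3])
    labels = torch.tensor([1, 0, 1, 0])       # 1 = positive pair, 0 = negative pair
    poss, negs = distances[labels == 1], distances[labels == 0]
    hard_negatives = negs[negs < poss.max()]  # -> tensor([0.3])
    hard_positives = poss[poss > negs.min()]  # -> tensor([0.6])
    return hard_positives.pow(2).sum() + F.relu(margin - hard_negatives).pow(2).sum()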