import random
import math
class NoDuplicatesDataLoader:
def __init__(self, train_examples, batch_size):
"""
A special data loader to be used with MultipleNegativesRankingLoss.
The data loader ensures that there are no duplicate sentences within the same batch
"""
self.batch_size = batch_size
self.data_pointer = 0
self.collate_fn = None
self.train_examples = train_examples
random.shuffle(self.train_examples)
def __iter__(self):
for _ in range(self.__len__()):
batch = []
texts_in_batch = set()
while len(batch) < self.batch_size:
example = self.train_examples[self.data_pointer]
valid_example = True
for text in example.texts:
if text.strip().lower() in texts_in_batch:
valid_example = False
break
if valid_example:
batch.append(example)
for text in example.texts:
texts_in_batch.add(text.strip().lower())
self.data_pointer += 1
if self.data_pointer >= len(self.train_examples):
self.data_pointer = 0
random.shuffle(self.train_examples)
yield self.collate_fn(batch) if self.collate_fn is not None else batch
def __len__(self):
return math.floor(len(self.train_examples) / self.batch_size)
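# Usage sketch (editor's illustration, not part of the original module): training with
# MultipleNegativesRankingLoss, where duplicate in-batch sentences would create false
# negatives. The model name and example texts are assumptions chosen for illustration.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")  # any SentenceTransformer checkpoint
    train_examples = [
        InputExample(texts=["A man is eating food.", "A man is eating a meal."]),
        InputExample(texts=["A dog runs outside.", "The dog is running in the yard."]),
    ]
    # model.fit() assigns model.smart_batching_collate as the collate_fn of this loader.
    train_dataloader = NoDuplicatesDataLoader(train_examples, batch_size=2)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)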
from torch.utils.data import Dataset
import logging
import gzip
from .. import SentenceTransformer
from ..readers import InputExample
from typing import List
import random
logger = logging.getLogger(__name__)
class ParallelSentencesDataset(Dataset):
"""
This dataset reader can be used to read in parallel sentences, i.e., it reads a file with tab-separated sentences where the same
sentence is given in different languages. For example, the file can look like this (EN\tDE\tES):
hello world hallo welt hola mundo
second sentence zweiter satz segunda oración
The sentence in the first column will be mapped to a sentence embedding using the given embedder. For example, the
embedder can be a monolingual sentence embedding method for English. The sentences in the other languages will also be
mapped to this English sentence embedding.
When getting a sample from the dataset, we get one sentence with the corresponding sentence embedding for this sentence.
teacher_model can be any class that implements an encode function. The encode function gets a list of sentences and
returns a list of sentence embeddings.
"""
def __init__(
self,
student_model: SentenceTransformer,
teacher_model: SentenceTransformer,
batch_size: int = 8,
use_embedding_cache: bool = True,
):
"""
Parallel sentences dataset reader to train student model given a teacher model
:param student_model: Student sentence embedding model that should be trained
:param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file
"""
self.student_model = student_model
self.teacher_model = teacher_model
self.datasets = []
self.datasets_iterator = []
self.datasets_tokenized = []
self.dataset_indices = []
self.copy_dataset_indices = []
self.cache = []
self.batch_size = batch_size
self.use_embedding_cache = use_embedding_cache
self.embedding_cache = {}
self.num_sentences = 0
def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128):
"""
Reads in a tab-separated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column
:param filepath: Filepath to the file
:param weight: If more than one dataset is loaded with load_data, this weight determines how frequently data is sampled from this dataset relative to the others
:param max_sentences: Max number of lines to be read from filepath
:param max_sentence_length: Skip the example if one of the sentences has more than max_sentence_length characters
:return:
"""
logger.info("Load " + filepath)
parallel_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
count = 0
for line in fIn:
sentences = line.strip().split("\t")
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
parallel_sentences.append(sentences)
count += 1
if max_sentences is not None and max_sentences > 0 and count >= max_sentences:
break
self.add_dataset(
parallel_sentences, weight=weight, max_sentences=max_sentences, max_sentence_length=max_sentence_length
)
def add_dataset(
self,
parallel_sentences: List[List[str]],
weight: int = 100,
max_sentences: int = None,
max_sentence_length: int = 128,
):
sentences_map = {}
for sentences in parallel_sentences:
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
source_sentence = sentences[0]
if source_sentence not in sentences_map:
sentences_map[source_sentence] = set()
for sent in sentences:
sentences_map[source_sentence].add(sent)
if max_sentences is not None and max_sentences > 0 and len(sentences_map) >= max_sentences:
break
if len(sentences_map) == 0:
return
self.num_sentences += sum([len(sentences_map[sent]) for sent in sentences_map])
dataset_id = len(self.datasets)
self.datasets.append(list(sentences_map.items()))
self.datasets_iterator.append(0)
self.dataset_indices.extend([dataset_id] * weight)
def generate_data(self):
source_sentences_list = []
target_sentences_list = []
for data_idx in self.dataset_indices:
src_sentence, trg_sentences = self.next_entry(data_idx)
source_sentences_list.append(src_sentence)
target_sentences_list.append(trg_sentences)
# Generate embeddings
src_embeddings = self.get_embeddings(source_sentences_list)
for src_embedding, trg_sentences in zip(src_embeddings, target_sentences_list):
for trg_sentence in trg_sentences:
self.cache.append(InputExample(texts=[trg_sentence], label=src_embedding))
random.shuffle(self.cache)
def next_entry(self, data_idx):
source, target_sentences = self.datasets[data_idx][self.datasets_iterator[data_idx]]
self.datasets_iterator[data_idx] += 1
if self.datasets_iterator[data_idx] >= len(self.datasets[data_idx]): # Restart iterator
self.datasets_iterator[data_idx] = 0
random.shuffle(self.datasets[data_idx])
return source, target_sentences
def get_embeddings(self, sentences):
if not self.use_embedding_cache:
return self.teacher_model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
# Use caching
new_sentences = []
for sent in sentences:
if sent not in self.embedding_cache:
new_sentences.append(sent)
if len(new_sentences) > 0:
new_embeddings = self.teacher_model.encode(
new_sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
for sent, embedding in zip(new_sentences, new_embeddings):
self.embedding_cache[sent] = embedding
return [self.embedding_cache[sent] for sent in sentences]
def __len__(self):
return self.num_sentences
def __getitem__(self, idx):
if len(self.cache) == 0:
self.generate_data()
return self.cache.pop()
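# Usage sketch (editor's illustration, not part of the original module): multilingual
# knowledge distillation as in https://arxiv.org/abs/2004.09813. The model names and the
# file path "parallel-sentences.tsv.gz" are assumptions for illustration only.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, losses

    teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # English teacher
    student = SentenceTransformer("xlm-roberta-base")                  # multilingual student
    train_data = ParallelSentencesDataset(student_model=student, teacher_model=teacher)
    train_data.load_data("parallel-sentences.tsv.gz")  # TSV: source sentence + translations
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
    # MSELoss regresses the student embedding onto the teacher embedding stored as label.
    train_loss = losses.MSELoss(model=student)
    student.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)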
""" """
from torch.utils.data import IterableDataset
import numpy as np
from typing import List
from ..readers import InputExample
import logging
logger = logging.getLogger(__name__)
class SentenceLabelDataset(IterableDataset):
"""
This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires
multiple examples with the same label in a batch.
It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label.
Labels with fewer than n unique samples are ignored.
This also applies to drawing without replacement: once fewer than n samples remain for a label, it is skipped.
This *DOES NOT* check whether there are at least as many labels as the batch size, or whether the batch size is
divisible by the number of samples drawn per label.
"""
def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False):
"""
Creates a LabelSampler for a SentenceLabelDataset.
:param examples:
a list with InputExamples
:param samples_per_label:
the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label
:param with_replacement:
if this is False, then each sample is drawn at most once (depending on the total number of samples per label).
if this is True, then one sample can be drawn in multiple draws, but still not multiple times in the same
drawing.
"""
super().__init__()
self.samples_per_label = samples_per_label
# Group examples by label
label2ex = {}
for example in examples:
if example.label not in label2ex:
label2ex[example.label] = []
label2ex[example.label].append(example)
# Include only labels with at least samples_per_label examples
self.grouped_inputs = []
self.groups_right_border = []
num_labels = 0
for label, label_examples in label2ex.items():
if len(label_examples) >= self.samples_per_label:
self.grouped_inputs.extend(label_examples)
self.groups_right_border.append(
len(self.grouped_inputs)
) # At which position does this label group / bucket end?
num_labels += 1
self.label_range = np.arange(num_labels)
self.with_replacement = with_replacement
np.random.shuffle(self.label_range)
logger.info(
"SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(
len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels
)
)
def __iter__(self):
label_idx = 0
count = 0
already_seen = {}
while count < len(self.grouped_inputs):
label = self.label_range[label_idx]
if label not in already_seen:
already_seen[label] = set()
left_border = 0 if label == 0 else self.groups_right_border[label - 1]
right_border = self.groups_right_border[label]
if self.with_replacement:
selection = np.arange(left_border, right_border)
else:
selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]]
if len(selection) >= self.samples_per_label:
for element_idx in np.random.choice(selection, self.samples_per_label, replace=False):
count += 1
already_seen[label].add(element_idx)
yield self.grouped_inputs[element_idx]
label_idx += 1
if label_idx >= len(self.label_range):
label_idx = 0
already_seen = {}
np.random.shuffle(self.label_range)
def __len__(self):
return len(self.grouped_inputs)
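# Usage sketch (editor's illustration, not part of the original module): batch-hard triplet
# training, which needs several examples per label in every batch. Model name and toy
# examples are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    examples = [
        InputExample(texts=["great movie"], label=1),
        InputExample(texts=["loved the film"], label=1),
        InputExample(texts=["terrible movie"], label=0),
        InputExample(texts=["awful film"], label=0),
    ]
    train_data = SentenceLabelDataset(examples, samples_per_label=2)
    # Batch size should be a multiple of samples_per_label; drop_last avoids ragged batches.
    train_dataloader = DataLoader(train_data, batch_size=4, drop_last=True)
    train_loss = losses.BatchHardTripletLoss(model=model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)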
from torch.utils.data import Dataset
from typing import List
from .. import SentenceTransformer
from ..readers.InputExample import InputExample
class SentencesDataset(Dataset):
"""
DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset
and then passing it to the DataLoader, you can pass the list of InputExamples directly to the DataLoader.
"""
def __init__(self, examples: List[InputExample], model: SentenceTransformer):
self.examples = examples
def __getitem__(self, item):
return self.examples[item]
def __len__(self):
return len(self.examples)
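# Usage sketch (editor's illustration, not part of the original module): as the deprecation
# note above says, the list of InputExamples can go straight into a DataLoader. Model name,
# sentences and labels are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    train_examples = [
        InputExample(texts=["First sentence", "Second sentence"], label=0.8),
        InputExample(texts=["Another pair", "Yet another sentence"], label=0.3),
    ]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
    train_loss = losses.CosineSimilarityLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)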
from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset
from .NoDuplicatesDataLoader import NoDuplicatesDataLoader
from .ParallelSentencesDataset import ParallelSentencesDataset
from .SentencesDataset import SentencesDataset
from .SentenceLabelDataset import SentenceLabelDataset
__all__ = [
"DenoisingAutoEncoderDataset",
"NoDuplicatesDataLoader",
"ParallelSentencesDataset",
"SentencesDataset",
"SentenceLabelDataset",
]
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class BinaryClassificationEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and
dissimilar sentences.
The metrics are computed for cosine similarity, dot-product similarity, Euclidean distance and Manhattan distance.
The returned score is the maximum average precision across these metrics.
The results are written in a CSV. If a CSV already exists, then values are appended.
The labels need to be 0 for dissimilar pairs and 1 for similar pairs.
:param sentences1: The first column of sentences
:param sentences2: The second column of sentences
:param labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
labels: List[int],
name: str = "",
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
for label in labels:
assert label == 0 or label == 1
self.write_csv = write_csv
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = "binary_classification_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = [
"epoch",
"steps",
"cossim_accuracy",
"cossim_accuracy_threshold",
"cossim_f1",
"cossim_precision",
"cossim_recall",
"cossim_f1_threshold",
"cossim_ap",
"manhattan_accuracy",
"manhattan_accuracy_threshold",
"manhattan_f1",
"manhattan_precision",
"manhattan_recall",
"manhattan_f1_threshold",
"manhattan_ap",
"euclidean_accuracy",
"euclidean_accuracy_threshold",
"euclidean_f1",
"euclidean_precision",
"euclidean_recall",
"euclidean_f1_threshold",
"euclidean_ap",
"dot_accuracy",
"dot_accuracy_threshold",
"dot_f1",
"dot_precision",
"dot_recall",
"dot_f1_threshold",
"dot_ap",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
# Main score is the max of Average Precision (AP)
main_score = max(scores[short_name]["ap"] for short_name in scores)
file_output_data = [epoch, steps]
for header_name in self.csv_headers:
if "_" in header_name:
sim_fct, metric = header_name.split("_", maxsplit=1)
file_output_data.append(scores[sim_fct][metric])
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow(file_output_data)
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(file_output_data)
return main_score
def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
embeddings1_np = np.asarray(embeddings1)
embeddings2_np = np.asarray(embeddings2)
dot_scores = [np.dot(embeddings1_np[i], embeddings2_np[i]) for i in range(len(embeddings1_np))]
labels = np.asarray(self.labels)
output_scores = {}
for short_name, name, scores, reverse in [
["cossim", "Cosine-Similarity", cosine_scores, True],
["manhattan", "Manhattan-Distance", manhattan_distances, False],
["euclidean", "Euclidean-Distance", euclidean_distances, False],
["dot", "Dot-Product", dot_scores, True],
]:
acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
ap = average_precision_score(labels, scores * (1 if reverse else -1))
logger.info(
"Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold)
)
logger.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
logger.info("Precision with {}: {:.2f}".format(name, precision * 100))
logger.info("Recall with {}: {:.2f}".format(name, recall * 100))
logger.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))
output_scores[short_name] = {
"accuracy": acc,
"accuracy_threshold": acc_threshold,
"f1": f1,
"f1_threshold": f1_threshold,
"precision": precision,
"recall": recall,
"ap": ap,
}
return output_scores
@staticmethod
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
max_acc = 0
best_threshold = -1
positive_so_far = 0
remaining_negatives = sum(labels == 0)
for i in range(len(rows) - 1):
score, label = rows[i]
if label == 1:
positive_so_far += 1
else:
remaining_negatives -= 1
acc = (positive_so_far + remaining_negatives) / len(labels)
if acc > max_acc:
max_acc = acc
best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
return max_acc, best_threshold
@staticmethod
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
scores = np.asarray(scores)
labels = np.asarray(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
best_f1 = best_precision = best_recall = 0
threshold = 0
nextract = 0
ncorrect = 0
total_num_duplicates = sum(labels)
for i in range(len(rows) - 1):
score, label = rows[i]
nextract += 1
if label == 1:
ncorrect += 1
if ncorrect > 0:
precision = ncorrect / nextract
recall = ncorrect / total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (rows[i][0] + rows[i + 1][0]) / 2
return best_f1, best_precision, best_recall, threshold
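# Usage sketch (editor's illustration, not part of the original module): evaluating whether
# sentence pairs are duplicates (label 1) or not (label 0). The model name and the pairs
# below are assumptions.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    evaluator = BinaryClassificationEvaluator(
        sentences1=["How do I learn Python?", "What is the capital of France?"],
        sentences2=["What is the best way to learn Python?", "Who wrote Hamlet?"],
        labels=[1, 0],
        name="toy-duplicates",
    )
    # Returns the best average precision across the similarity/distance metrics.
    score = evaluator(model, output_path=".")
    print(score)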
from contextlib import nullcontext
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
from typing import List, Literal, Optional
from ..readers import InputExample
logger = logging.getLogger(__name__)
class EmbeddingSimilarityEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation
in comparison to the gold standard labels.
The metrics are computed for cosine similarity, dot-product similarity, Euclidean distance and Manhattan distance.
The returned score is the Spearman correlation for the specified main_similarity metric (or the best one if none is given).
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
scores: List[float],
batch_size: int = 16,
main_similarity: SimilarityFunction = None,
name: str = "",
show_progress_bar: bool = False,
write_csv: bool = True,
precision: Optional[Literal["float32", "int8", "uint8", "binary", "ubinary"]] = None,
truncate_dim: Optional[int] = None,
):
"""
Constructs an evaluator for the given dataset.
The labels need to indicate the similarity between the sentences.
:param sentences1: List with the first sentence in a pair
:param sentences2: List with the second sentence in a pair
:param scores: Similarity score between sentences1[i] and sentences2[i]
:param write_csv: Write results to a CSV file
:param precision: The precision to use for the embeddings. Can be "float32", "int8", "uint8", "binary", or
"ubinary". Defaults to None.
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current
truncation dimension. Defaults to None.
"""
self.sentences1 = sentences1
self.sentences2 = sentences2
self.scores = scores
self.write_csv = write_csv
self.precision = precision
self.truncate_dim = truncate_dim
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.scores)
self.main_similarity = main_similarity
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = (
"similarity_evaluation"
+ ("_" + name if name else "")
+ ("_" + precision if precision else "")
+ "_results.csv"
)
self.csv_headers = [
"epoch",
"steps",
"cosine_pearson",
"cosine_spearman",
"euclidean_pearson",
"euclidean_spearman",
"manhattan_pearson",
"manhattan_spearman",
"dot_pearson",
"dot_spearman",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
self.sentences1,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
embeddings2 = model.encode(
self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
# Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
if self.precision == "binary":
embeddings1 = (embeddings1 + 128).astype(np.uint8)
embeddings2 = (embeddings2 + 128).astype(np.uint8)
if self.precision in ("ubinary", "binary"):
embeddings1 = np.unpackbits(embeddings1, axis=1)
embeddings2 = np.unpackbits(embeddings2, axis=1)
labels = self.scores
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
eval_pearson_dot, _ = pearsonr(labels, dot_products)
eval_spearman_dot, _ = spearmanr(labels, dot_products)
logger.info(
"Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_cosine, eval_spearman_cosine)
)
logger.info(
"Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_manhattan, eval_spearman_manhattan
)
)
logger.info(
"Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_euclidean, eval_spearman_euclidean
)
)
logger.info(
"Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_dot, eval_spearman_dot)
)
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow(
[
epoch,
steps,
eval_pearson_cosine,
eval_spearman_cosine,
eval_pearson_euclidean,
eval_spearman_euclidean,
eval_pearson_manhattan,
eval_spearman_manhattan,
eval_pearson_dot,
eval_spearman_dot,
]
)
if self.main_similarity == SimilarityFunction.COSINE:
return eval_spearman_cosine
elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
return eval_spearman_euclidean
elif self.main_similarity == SimilarityFunction.MANHATTAN:
return eval_spearman_manhattan
elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
return eval_spearman_dot
elif self.main_similarity is None:
return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
else:
raise ValueError("Unknown main_similarity value")
from . import SentenceEvaluator
import torch
from torch import Tensor
import logging
from tqdm import trange
from ..util import cos_sim, dot_score
import os
import numpy as np
from typing import List, Dict, Set, Callable
import heapq
logger = logging.getLogger(__name__)
class InformationRetrievalEvaluator(SentenceEvaluator):
"""
This class evaluates an Information Retrieval (IR) setting.
Given a set of queries and a large corpus, it retrieves for each query the top-k most similar documents. It measures
Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG).
"""
def __init__(
self,
queries: Dict[str, str], # qid => query
corpus: Dict[str, str], # cid => doc
relevant_docs: Dict[str, Set[str]], # qid => Set[cid]
corpus_chunk_size: int = 50000,
mrr_at_k: List[int] = [10],
ndcg_at_k: List[int] = [10],
accuracy_at_k: List[int] = [1, 3, 5, 10],
precision_recall_at_k: List[int] = [1, 3, 5, 10],
map_at_k: List[int] = [100],
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
"cos_sim": cos_sim,
"dot_score": dot_score,
}, # Score function, higher=more similar
main_score_function: str = None,
):
self.queries_ids = []
for qid in queries:
if qid in relevant_docs and len(relevant_docs[qid]) > 0:
self.queries_ids.append(qid)
self.queries = [queries[qid] for qid in self.queries_ids]
self.corpus_ids = list(corpus.keys())
self.corpus = [corpus[cid] for cid in self.corpus_ids]
self.relevant_docs = relevant_docs
self.corpus_chunk_size = corpus_chunk_size
self.mrr_at_k = mrr_at_k
self.ndcg_at_k = ndcg_at_k
self.accuracy_at_k = accuracy_at_k
self.precision_recall_at_k = precision_recall_at_k
self.map_at_k = map_at_k
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.write_csv = write_csv
self.score_functions = score_functions
self.score_function_names = sorted(list(self.score_functions.keys()))
self.main_score_function = main_score_function
if name:
name = "_" + name
self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
for score_name in self.score_function_names:
for k in accuracy_at_k:
self.csv_headers.append("{}-Accuracy@{}".format(score_name, k))
for k in precision_recall_at_k:
self.csv_headers.append("{}-Precision@{}".format(score_name, k))
self.csv_headers.append("{}-Recall@{}".format(score_name, k))
for k in mrr_at_k:
self.csv_headers.append("{}-MRR@{}".format(score_name, k))
for k in ndcg_at_k:
self.csv_headers.append("{}-NDCG@{}".format(score_name, k))
for k in map_at_k:
self.csv_headers.append("{}-MAP@{}".format(score_name, k))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
if epoch != -1:
out_txt = (
" after epoch {}:".format(epoch)
if steps == -1
else " in epoch {} after {} steps:".format(epoch, steps)
)
else:
out_txt = ":"
logger.info("Information Retrieval Evaluation on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model, *args, **kwargs)
# Write results to disk
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
fOut = open(csv_path, mode="w", encoding="utf-8")
fOut.write(",".join(self.csv_headers))
fOut.write("\n")
else:
fOut = open(csv_path, mode="a", encoding="utf-8")
output_data = [epoch, steps]
for name in self.score_function_names:
for k in self.accuracy_at_k:
output_data.append(scores[name]["accuracy@k"][k])
for k in self.precision_recall_at_k:
output_data.append(scores[name]["precision@k"][k])
output_data.append(scores[name]["recall@k"][k])
for k in self.mrr_at_k:
output_data.append(scores[name]["mrr@k"][k])
for k in self.ndcg_at_k:
output_data.append(scores[name]["ndcg@k"][k])
for k in self.map_at_k:
output_data.append(scores[name]["map@k"][k])
fOut.write(",".join(map(str, output_data)))
fOut.write("\n")
fOut.close()
if self.main_score_function is None:
return max([scores[name]["map@k"][max(self.map_at_k)] for name in self.score_function_names])
else:
return scores[self.main_score_function]["map@k"][max(self.map_at_k)]
def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor = None) -> Dict[str, float]:
if corpus_model is None:
corpus_model = model
max_k = max(
max(self.mrr_at_k),
max(self.ndcg_at_k),
max(self.accuracy_at_k),
max(self.precision_recall_at_k),
max(self.map_at_k),
)
# Compute embedding for the queries
query_embeddings = model.encode(
self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True
)
queries_result_list = {}
for name in self.score_functions:
queries_result_list[name] = [[] for _ in range(len(query_embeddings))]
# Iterate over chunks of the corpus
for corpus_start_idx in trange(
0, len(self.corpus), self.corpus_chunk_size, desc="Corpus Chunks", disable=not self.show_progress_bar
):
corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))
# Encode chunk of corpus
if corpus_embeddings is None:
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
else:
sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]
# Compute similarities with each score function
for name, score_function in self.score_functions.items():
pair_scores = score_function(query_embeddings, sub_corpus_embeddings)
# Get top-k values
pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(
pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False
)
pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()
for query_itr in range(len(query_embeddings)):
for sub_corpus_id, score in zip(
pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]
):
corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
if len(queries_result_list[name][query_itr]) < max_k:
heapq.heappush(
queries_result_list[name][query_itr], (score, corpus_id)
) # heapq orders entries by the first element of the tuple (the score)
else:
heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id))
for name in queries_result_list:
for query_itr in range(len(queries_result_list[name])):
for doc_itr in range(len(queries_result_list[name][query_itr])):
score, corpus_id = queries_result_list[name][query_itr][doc_itr]
queries_result_list[name][query_itr][doc_itr] = {"corpus_id": corpus_id, "score": score}
logger.info("Queries: {}".format(len(self.queries)))
logger.info("Corpus: {}\n".format(len(self.corpus)))
# Compute scores
scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}
# Output
for name in self.score_function_names:
logger.info("Score-Function: {}".format(name))
self.output_scores(scores[name])
return scores
def compute_metrics(self, queries_result_list: List[object]):
# Init score computation values
num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
precisions_at_k = {k: [] for k in self.precision_recall_at_k}
recall_at_k = {k: [] for k in self.precision_recall_at_k}
MRR = {k: 0 for k in self.mrr_at_k}
ndcg = {k: [] for k in self.ndcg_at_k}
AveP_at_k = {k: [] for k in self.map_at_k}
# Compute scores on results
for query_itr in range(len(queries_result_list)):
query_id = self.queries_ids[query_itr]
# Sort scores
top_hits = sorted(queries_result_list[query_itr], key=lambda x: x["score"], reverse=True)
query_relevant_docs = self.relevant_docs[query_id]
# Accuracy@k - We count the result as correct if at least one relevant doc is among the top-k documents
for k_val in self.accuracy_at_k:
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_hits_at_k[k_val] += 1
break
# Precision and Recall@k
for k_val in self.precision_recall_at_k:
num_correct = 0
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
precisions_at_k[k_val].append(num_correct / k_val)
recall_at_k[k_val].append(num_correct / len(query_relevant_docs))
# MRR@k
for k_val in self.mrr_at_k:
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
MRR[k_val] += 1.0 / (rank + 1)
break
# NDCG@k
for k_val in self.ndcg_at_k:
predicted_relevance = [
1 if top_hit["corpus_id"] in query_relevant_docs else 0 for top_hit in top_hits[0:k_val]
]
true_relevances = [1] * len(query_relevant_docs)
ndcg_value = self.compute_dcg_at_k(predicted_relevance, k_val) / self.compute_dcg_at_k(
true_relevances, k_val
)
ndcg[k_val].append(ndcg_value)
# MAP@k
for k_val in self.map_at_k:
num_correct = 0
sum_precisions = 0
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
sum_precisions += num_correct / (rank + 1)
avg_precision = sum_precisions / min(k_val, len(query_relevant_docs))
AveP_at_k[k_val].append(avg_precision)
# Compute averages
for k in num_hits_at_k:
num_hits_at_k[k] /= len(self.queries)
for k in precisions_at_k:
precisions_at_k[k] = np.mean(precisions_at_k[k])
for k in recall_at_k:
recall_at_k[k] = np.mean(recall_at_k[k])
for k in ndcg:
ndcg[k] = np.mean(ndcg[k])
for k in MRR:
MRR[k] /= len(self.queries)
for k in AveP_at_k:
AveP_at_k[k] = np.mean(AveP_at_k[k])
return {
"accuracy@k": num_hits_at_k,
"precision@k": precisions_at_k,
"recall@k": recall_at_k,
"ndcg@k": ndcg,
"mrr@k": MRR,
"map@k": AveP_at_k,
}
def output_scores(self, scores):
for k in scores["accuracy@k"]:
logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))
for k in scores["precision@k"]:
logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))
for k in scores["recall@k"]:
logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))
for k in scores["mrr@k"]:
logger.info("MRR@{}: {:.4f}".format(k, scores["mrr@k"][k]))
for k in scores["ndcg@k"]:
logger.info("NDCG@{}: {:.4f}".format(k, scores["ndcg@k"][k]))
for k in scores["map@k"]:
logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))
@staticmethod
def compute_dcg_at_k(relevances, k):
dcg = 0
for i in range(min(len(relevances), k)):
dcg += relevances[i] / np.log2(i + 2) # +2 as we start our idx at 0
return dcg
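# Usage sketch (editor's illustration, not part of the original module): a tiny query/corpus
# setup. The IDs, texts and model name are assumptions; small at-k values keep the toy
# example meaningful.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    queries = {"q1": "What is the capital of France?"}
    corpus = {
        "d1": "Paris is the capital of France.",
        "d2": "Berlin is the capital of Germany.",
        "d3": "The Eiffel Tower is in Paris.",
    }
    relevant_docs = {"q1": {"d1"}}
    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs,
        accuracy_at_k=[1, 3], precision_recall_at_k=[1, 3],
        mrr_at_k=[3], ndcg_at_k=[3], map_at_k=[3],
        name="toy-ir",
    )
    # Returns the best MAP@k across the configured score functions (cos_sim, dot_score).
    print(evaluator(model, output_path="."))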
from . import SentenceEvaluator
import torch
from torch.utils.data import DataLoader
import logging
from ..util import batch_to_device
import os
import csv
logger = logging.getLogger(__name__)
class LabelAccuracyEvaluator(SentenceEvaluator):
"""
Evaluate a model based on its accuracy on a labeled dataset
This requires the corresponding SoftmaxLoss model, passed via the softmax_model parameter.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(self, dataloader: DataLoader, name: str = "", softmax_model=None, write_csv: bool = True):
"""
Constructs an evaluator for the given dataset
:param dataloader:
the data for the evaluation
"""
self.dataloader = dataloader
self.name = name
self.softmax_model = softmax_model
if name:
name = "_" + name
self.write_csv = write_csv
self.csv_file = "accuracy_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy"]
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
model.eval()
total = 0
correct = 0
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluation on the " + self.name + " dataset" + out_txt)
self.dataloader.collate_fn = model.smart_batching_collate
for step, batch in enumerate(self.dataloader):
features, label_ids = batch
for idx in range(len(features)):
features[idx] = batch_to_device(features[idx], model.device)
label_ids = label_ids.to(model.device)
with torch.no_grad():
_, prediction = self.softmax_model(features, labels=None)
total += prediction.size(0)
correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
accuracy = correct / total
logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy])
return accuracy
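# Usage sketch (editor's illustration, not part of the original module): the evaluator needs
# the SoftmaxLoss head used during training, passed via softmax_model. Model name, labels
# and sentences are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    examples = [
        InputExample(texts=["A man is eating.", "A man eats food."], label=0),
        InputExample(texts=["A man is eating.", "The weather is nice."], label=1),
    ]
    softmax_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2,
    )
    softmax_loss.to(model.device)  # during model.fit() the loss is moved automatically
    eval_dataloader = DataLoader(examples, batch_size=2)
    evaluator = LabelAccuracyEvaluator(eval_dataloader, name="toy-nli", softmax_model=softmax_loss)
    print(evaluator(model, output_path="."))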
from sentence_transformers.evaluation import SentenceEvaluator
import logging
import os
import csv
from typing import List
logger = logging.getLogger(__name__)
class MSEEvaluator(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding
and some target sentence embedding.
The MSE is computed over teacher.encode(source_sentences) - student.encode(target_sentences), i.e. the mean squared difference between teacher and student embeddings.
For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English
and target_sentences are in a different language like German, Chinese, Spanish...
:param source_sentences: Source sentences are embedded with the teacher model
:param target_sentences: Target sentences are embedded with the student model.
:param show_progress_bar: Show progress bar when computing embeddings
:param batch_size: Batch size to compute sentence embeddings
:param name: Name of the evaluator
:param write_csv: Write results to CSV file
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
teacher_model=None,
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
):
self.source_embeddings = teacher_model.encode(
source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True
)
self.target_sentences = target_sentences
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.csv_file = "mse_evaluation_" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "MSE"]
self.write_csv = write_csv
def __call__(self, model, output_path, epoch=-1, steps=-1):
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
target_embeddings = model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=True,
)
mse = ((self.source_embeddings - target_embeddings) ** 2).mean()
mse *= 100
logger.info("MSE evaluation (lower = better) on " + self.name + " dataset" + out_txt)
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mse])
return -mse # Return negative score as SentenceTransformers maximizes the performance
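# Usage sketch (editor's illustration, not part of the original module): measuring how closely
# a student model matches a teacher on translated sentences. The model names and the EN/DE
# pairs are assumptions; teacher and student must share the embedding dimension.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    teacher = SentenceTransformer("all-MiniLM-L6-v2")
    student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    source_sentences = ["Hello world", "How are you?"]
    target_sentences = ["Hallo Welt", "Wie geht es dir?"]
    evaluator = MSEEvaluator(source_sentences, target_sentences, teacher_model=teacher, name="en-de")
    # Returns the negated MSE*100, since higher evaluator scores are treated as better.
    print(evaluator(student, output_path="."))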
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Dict
import numpy as np
import logging
import os
import csv
logger = logging.getLogger(__name__)
class MSEEvaluatorFromDataFrame(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding and some target sentence embedding.
:param dataframe: It must have the following format. Rows contain different, parallel sentences.
Columns are the respective language codes::
[{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...},
{'en': 'My second sentence', ...}]
:param combinations: Must be of the format ``[('en', 'es'), ('en', 'fr'), ...]``.
The first entry in a tuple is the source language; the sentence in that language will be fetched from
the dataframe and passed to the teacher model. The second entry is the target language; its sentence
will be fetched from the dataframe and passed to the student model.
"""
def __init__(
self,
dataframe: List[Dict[str, str]],
teacher_model: SentenceTransformer,
combinations: List[Tuple[str, str]],
batch_size: int = 8,
name="",
write_csv: bool = True,
):
self.combinations = combinations
self.name = name
self.batch_size = batch_size
if name:
name = "_" + name
self.csv_file = "mse_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
self.write_csv = write_csv
self.data = {}
logger.info("Compute teacher embeddings")
all_source_sentences = set()
for src_lang, trg_lang in self.combinations:
src_sentences = []
trg_sentences = []
for row in dataframe:
if row[src_lang].strip() != "" and row[trg_lang].strip() != "":
all_source_sentences.add(row[src_lang])
src_sentences.append(row[src_lang])
trg_sentences.append(row[trg_lang])
self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences)
self.csv_headers.append("{}-{}".format(src_lang, trg_lang))
all_source_sentences = list(all_source_sentences)
all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size)
self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)}
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
model.eval()
mse_scores = []
for src_lang, trg_lang in self.combinations:
src_sentences, trg_sentences = self.data[(src_lang, trg_lang)]
src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences])
trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size))
mse = ((src_embeddings - trg_embeddings) ** 2).mean()
mse *= 100
mse_scores.append(mse)
logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang))
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps] + mse_scores)
return -np.mean(mse_scores) # Return negative score as SentenceTransformers maximizes the performance
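# Usage sketch (editor's illustration, not part of the original module): the "dataframe" is a
# list of dicts keyed by language code. Model names, language codes and sentences below are
# assumptions; teacher and student must share the embedding dimension.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    teacher = SentenceTransformer("all-MiniLM-L6-v2")
    student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    data = [
        {"en": "Hello world", "de": "Hallo Welt", "es": "Hola mundo"},
        {"en": "Good morning", "de": "Guten Morgen", "es": "Buenos días"},
    ]
    evaluator = MSEEvaluatorFromDataFrame(
        dataframe=data,
        teacher_model=teacher,
        combinations=[("en", "de"), ("en", "es")],
        name="toy-multilingual",
    )
    print(evaluator(student, output_path="."))  # negated mean MSE*100 across combinations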
from . import SentenceEvaluator
import logging
from sentence_transformers.util import paraphrase_mining
import os
import csv
from typing import List, Tuple, Dict
from collections import defaultdict
logger = logging.getLogger(__name__)
class ParaphraseMiningEvaluator(SentenceEvaluator):
"""
Given a large set of sentences, this evaluator performs paraphrase (duplicate) mining and
identifies the pairs with the highest similarity. It compares the extracted paraphrase pairs
with a set of gold labels and computes the F1 score.
"""
def __init__(
self,
sentences_map: Dict[str, str],
duplicates_list: List[Tuple[str, str]] = None,
duplicates_dict: Dict[str, Dict[str, bool]] = None,
add_transitive_closure: bool = False,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
write_csv: bool = True,
):
"""
:param sentences_map: A dictionary that maps sentence-ids to sentences, i.e. sentences_map[id] => sentence.
:param duplicates_list: Duplicates_list is a list with id pairs [(id1, id2), (id1, id5)] that identifies the duplicates / paraphrases in the sentences_map
:param duplicates_dict: A default dictionary mapping [id1][id2] to true if id1 and id2 are duplicates. Must be symmetric, i.e., if [id1][id2] => True, then [id2][id1] => True.
:param add_transitive_closure: If true, it adds a transitive closure, i.e. if dup[a][b] and dup[b][c], then dup[a][c]
:param query_chunk_size: To identify the paraphrases, the cosine-similarity between all sentence-pairs will be computed. As this might require a lot of memory, we perform a batched computation. #query_batch_size sentences will be compared against up to #corpus_batch_size sentences. In the default setting, 5000 sentences will be grouped together and compared up-to against 100k other sentences.
:param corpus_chunk_size: The corpus will be batched, to reduce the memory requirement
:param max_pairs: We will only extract up to #max_pairs potential paraphrase candidates.
:param top_k: For each query, we extract the top_k most similar pairs and add them to a sorted list. I.e., for one sentence we cannot find more than top_k paraphrases
:param show_progress_bar: Output a progress bar
:param batch_size: Batch size for computing sentence embeddings
:param name: Name of the experiment
:param write_csv: Write results to CSV file
"""
self.sentences = []
self.ids = []
for id, sentence in sentences_map.items():
self.sentences.append(sentence)
self.ids.append(id)
self.name = name
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.query_chunk_size = query_chunk_size
self.corpus_chunk_size = corpus_chunk_size
self.max_pairs = max_pairs
self.top_k = top_k
self.duplicates = duplicates_dict if duplicates_dict is not None else defaultdict(lambda: defaultdict(bool))
if duplicates_list is not None:
for id1, id2 in duplicates_list:
if id1 in sentences_map and id2 in sentences_map:
self.duplicates[id1][id2] = True
self.duplicates[id2][id1] = True
# Add transitive closure
if add_transitive_closure:
self.duplicates = self.add_transitive_closure(self.duplicates)
positive_key_pairs = set()
for key1 in self.duplicates:
for key2 in self.duplicates[key1]:
if (
key1 in sentences_map
and key2 in sentences_map
and (self.duplicates[key1][key2] or self.duplicates[key2][key1])
):
positive_key_pairs.add(tuple(sorted([key1, key2])))
self.total_num_duplicates = len(positive_key_pairs)
if name:
name = "_" + name
self.csv_file: str = "paraphrase_mining_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "precision", "recall", "f1", "threshold", "average_precision"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Paraphrase Mining Evaluation on " + self.name + " dataset" + out_txt)
# Compute embedding for the sentences
pairs_list = paraphrase_mining(
model,
self.sentences,
self.show_progress_bar,
self.batch_size,
self.query_chunk_size,
self.corpus_chunk_size,
self.max_pairs,
self.top_k,
)
logger.info("Number of candidate pairs: " + str(len(pairs_list)))
# Compute F1 score and Average Precision
n_extract = n_correct = 0
threshold = 0
best_f1 = best_recall = best_precision = 0
average_precision = 0
for idx in range(len(pairs_list)):
score, i, j = pairs_list[idx]
id1 = self.ids[i]
id2 = self.ids[j]
# Compute optimal threshold and F1-score
n_extract += 1
if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
n_correct += 1
precision = n_correct / n_extract
recall = n_correct / self.total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
average_precision += precision
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (pairs_list[idx][0] + pairs_list[min(idx + 1, len(pairs_list) - 1)][0]) / 2
average_precision = average_precision / self.total_num_duplicates
logger.info("Average Precision: {:.2f}".format(average_precision * 100))
logger.info("Optimal threshold: {:.4f}".format(threshold))
logger.info("Precision: {:.2f}".format(best_precision * 100))
logger.info("Recall: {:.2f}".format(best_recall * 100))
logger.info("F1: {:.2f}\n".format(best_f1 * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
return average_precision
@staticmethod
def add_transitive_closure(graph):
nodes_visited = set()
for a in list(graph.keys()):
if a not in nodes_visited:
connected_subgraph_nodes = set()
connected_subgraph_nodes.add(a)
# Add all nodes in the connected graph
neighbor_nodes_queue = list(graph[a])
while len(neighbor_nodes_queue) > 0:
node = neighbor_nodes_queue.pop(0)
if node not in connected_subgraph_nodes:
connected_subgraph_nodes.add(node)
neighbor_nodes_queue.extend(graph[node])
# Ensure transitivity between all nodes in the graph
connected_subgraph_nodes = list(connected_subgraph_nodes)
for i in range(len(connected_subgraph_nodes) - 1):
for j in range(i + 1, len(connected_subgraph_nodes)):
graph[connected_subgraph_nodes[i]][connected_subgraph_nodes[j]] = True
graph[connected_subgraph_nodes[j]][connected_subgraph_nodes[i]] = True
nodes_visited.add(connected_subgraph_nodes[i])
nodes_visited.add(connected_subgraph_nodes[j])
return graph
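# Usage sketch (editor's illustration, not part of the original module): sentence ids map to
# sentences, and duplicates_list marks which id pairs are paraphrases. Model name and toy
# data are assumptions.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences_map = {
        "s1": "How can I learn Python quickly?",
        "s2": "What is the fastest way to learn Python?",
        "s3": "What is the capital of France?",
        "s4": "Which city is the capital of France?",
    }
    duplicates_list = [("s1", "s2"), ("s3", "s4")]
    evaluator = ParaphraseMiningEvaluator(sentences_map, duplicates_list=duplicates_list, name="toy")
    # Returns the average precision of the mined pairs against the gold duplicates.
    print(evaluator(model, output_path="."))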
from . import SentenceEvaluator
import logging
import numpy as np
import os
import csv
from ..util import cos_sim
import torch
from sklearn.metrics import average_precision_score, ndcg_score
import tqdm
from typing import Optional
logger = logging.getLogger(__name__)
class RerankingEvaluator(SentenceEvaluator):
"""
This class evaluates a SentenceTransformer model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10, NDCG@10 and MAP are computed to measure the quality of the ranking.
:param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self,
samples,
at_k: int = 10,
name: str = "",
write_csv: bool = True,
similarity_fct=cos_sim,
batch_size: int = 64,
show_progress_bar: bool = False,
use_batched_encoding: bool = True,
mrr_at_k: Optional[int] = None,
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
self.similarity_fct = similarity_fct
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.use_batched_encoding = use_batched_encoding
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
### Remove sample with empty positive / negative set
self.samples = [
sample for sample in self.samples if len(sample["positive"]) > 0 and len(sample["negative"]) > 0
]
self.csv_file = "RerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MAP",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
mean_ap = scores["map"]
mean_mrr = scores["mrr"]
mean_ndcg = scores["ndcg"]
#### Some stats about the dataset
num_positives = [len(sample["positive"]) for sample in self.samples]
num_negatives = [len(sample["negative"]) for sample in self.samples]
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
len(self.samples),
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MAP: {:.2f}".format(mean_ap * 100))
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
#### Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_ap, mean_mrr, mean_ndcg])
return mean_ap
def compute_metrices(self, model):
return (
self.compute_metrices_batched(model)
if self.use_batched_encoding
else self.compute_metrices_individual(model)
)
def compute_metrices_batched(self, model):
"""
Computes the metrics in a batched way, by encoding all queries and
all documents together
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
all_query_embs = model.encode(
[sample["query"] for sample in self.samples],
convert_to_tensor=True,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
)
all_docs = []
for sample in self.samples:
all_docs.extend(sample["positive"])
all_docs.extend(sample["negative"])
all_docs_embs = model.encode(
all_docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar
)
# Compute scores
query_idx, docs_idx = 0, 0
for instance in self.samples:
query_emb = all_query_embs[query_idx]
query_idx += 1
num_pos = len(instance["positive"])
num_neg = len(instance["negative"])
docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg]
docs_idx += num_pos + num_neg
if num_pos == 0 or num_neg == 0:
continue
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
is_relevant = [1] * num_pos + [0] * num_neg
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
def compute_metrices_individual(self, model):
"""
Embeds every (query, positive, negative) tuple individually.
It is slower than the batched version, but saves memory as only the
embeddings for one tuple are needed. Useful when you have
a really large test set.
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"):
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
if len(positive) == 0 or len(negative) == 0:
continue
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
query_emb = model.encode(
[query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False
)
docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False)
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
class SentenceEvaluator:
"""
Base class for all evaluators
Extend this class and implement __call__ for custom evaluators.
"""
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
"""
This is called during training to evaluate the model.
It returns a score for the evaluation with a higher score indicating a better result.
:param model:
the model to evaluate
:param output_path:
path where predictions and metrics are written to
:param epoch:
the epoch where the evaluation takes place.
This is used for the file prefixes.
If this is -1, then we assume evaluation on test data.
:param steps:
the steps in the current epoch at time of the evaluation.
This is used for the file prefixes.
If this is -1, then we assume evaluation at the end of the epoch.
:return: a score for the evaluation with a higher score indicating a better result
"""
pass
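# Minimal sketch of a custom evaluator built on the interface above: it scores
# a model by the mean cosine similarity of paired paraphrases. The class name
# and its arguments are illustrative placeholders, not part of the library API.
from ..util import pytorch_cos_sim


class ToyParaphraseEvaluator(SentenceEvaluator):
    def __init__(self, sentences1, sentences2):
        assert len(sentences1) == len(sentences2)
        self.sentences1 = sentences1
        self.sentences2 = sentences2

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        emb1 = model.encode(self.sentences1, convert_to_tensor=True)
        emb2 = model.encode(self.sentences2, convert_to_tensor=True)
        # Higher mean pairwise similarity -> better model, as the base class contract requires
        return pytorch_cos_sim(emb1, emb2).diagonal().mean().item()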
from . import SentenceEvaluator
from typing import Iterable
class SequentialEvaluator(SentenceEvaluator):
"""
This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated,
it is run sequentially through all sub-evaluators.
All scores are passed to 'main_score_function', which derives one final score value.
"""
def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function=lambda scores: scores[-1]):
self.evaluators = evaluators
self.main_score_function = main_score_function
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
scores = []
for evaluator in self.evaluators:
scores.append(evaluator(model, output_path, epoch, steps))
return self.main_score_function(scores)
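# Minimal sketch showing how SequentialEvaluator chains sub-evaluators and
# reduces their scores with `main_score_function` (here the mean instead of
# the default "last score"). The constant evaluators are stand-ins for real ones.
if __name__ == "__main__":
    class ConstantEvaluator(SentenceEvaluator):
        def __init__(self, score: float):
            self.score = score

        def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
            return self.score

    seq = SequentialEvaluator(
        [ConstantEvaluator(0.4), ConstantEvaluator(0.8)],
        main_score_function=lambda scores: sum(scores) / len(scores),
    )
    print(seq(model=None))  # ~0.6; the dummy evaluators never touch the model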
from enum import Enum
class SimilarityFunction(Enum):
COSINE = 0
EUCLIDEAN = 1
MANHATTAN = 2
DOT_PRODUCT = 3
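# Hypothetical helper sketch: one way to map the enum members above to scoring
# callables with "higher is better" semantics (similarity for COSINE/DOT_PRODUCT,
# negated distance for EUCLIDEAN/MANHATTAN). Inputs are assumed to be 2-D
# (batch, dim) float tensors; this helper is not part of the library API.
import torch
from ..util import pytorch_cos_sim


def similarity_scores(fct: SimilarityFunction, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    if fct == SimilarityFunction.COSINE:
        return pytorch_cos_sim(a, b)
    if fct == SimilarityFunction.DOT_PRODUCT:
        return a @ b.transpose(0, 1)
    if fct == SimilarityFunction.EUCLIDEAN:
        return -torch.cdist(a, b, p=2)  # negate so that larger values mean "more similar"
    if fct == SimilarityFunction.MANHATTAN:
        return -torch.cdist(a, b, p=1)
    raise ValueError(f"Unknown similarity function: {fct}")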
from . import SentenceEvaluator
import logging
from ..util import pytorch_cos_sim
import os
import csv
import numpy as np
from typing import List
import torch
logger = logging.getLogger(__name__)
class TranslationEvaluator(SentenceEvaluator):
"""
Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3, ...) and (fr_1, fr_2, fr_3, ...),
where fr_i is the translation of en_i, this evaluator checks whether vec(en_i) has the highest similarity
to vec(fr_i) among all target embeddings. The accuracy is computed in both directions.
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
print_wrong_matches: bool = False,
write_csv: bool = True,
):
"""
Constructs an evaluator for the given sets of parallel sentences.
The sentence at index i in target_sentences is assumed to be the translation of the sentence at index i in source_sentences.
:param source_sentences:
List of sentences in the source language
:param target_sentences:
List of sentences in the target language
:param show_progress_bar:
If True, shows a progress bar while computing embeddings
:param batch_size:
Batch size used to compute the sentence embeddings
:param name:
Name of the evaluator, used for logging and in the CSV file name
:param print_wrong_matches:
Prints incorrect matches
:param write_csv:
Write results to a CSV file
"""
self.source_sentences = source_sentences
self.target_sentences = target_sentences
self.name = name
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.print_wrong_matches = print_wrong_matches
assert len(self.source_sentences) == len(self.target_sentences)
if name:
name = "_" + name
self.csv_file = "translation_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluating translation matching Accuracy on " + self.name + " dataset" + out_txt)
embeddings1 = torch.stack(
model.encode(
self.source_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=False,
)
)
embeddings2 = torch.stack(
model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=False,
)
)
cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy()
correct_src2trg = 0
correct_trg2src = 0
for i in range(len(cos_sims)):
max_idx = np.argmax(cos_sims[i])
if i == max_idx:
correct_src2trg += 1
elif self.print_wrong_matches:
print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT")
print("Src:", self.source_sentences[i])
print("Trg:", self.target_sentences[max_idx])
print("Argmax score:", cos_sims[i][max_idx], "vs. correct score:", cos_sims[i][i])
results = zip(range(len(cos_sims[i])), cos_sims[i])
results = sorted(results, key=lambda x: x[1], reverse=True)
for idx, score in results[0:5]:
print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx])
cos_sims = cos_sims.T
for i in range(len(cos_sims)):
max_idx = np.argmax(cos_sims[i])
if i == max_idx:
correct_trg2src += 1
acc_src2trg = correct_src2trg / len(cos_sims)
acc_trg2src = correct_trg2src / len(cos_sims)
logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg * 100))
logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, acc_src2trg, acc_trg2src])
return (acc_src2trg + acc_trg2src) / 2
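# Minimal usage sketch for the evaluator above on a tiny EN/DE toy set.
# "paraphrase-multilingual-MiniLM-L12-v2" is just an example multilingual
# checkpoint; any multilingual SentenceTransformer can be substituted.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    en = ["Hello world", "The weather is nice today", "I like reading books"]
    de = ["Hallo Welt", "Das Wetter ist heute schön", "Ich lese gerne Bücher"]
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    evaluator = TranslationEvaluator(en, de, name="en-de", print_wrong_matches=True)
    score = evaluator(model, output_path=".")  # mean of the src2trg and trg2src accuracies
    print(f"Mean matching accuracy: {score:.3f}")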
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class TripletEvaluator(SentenceEvaluator):
"""
Evaluate a model based on a triplet: (sentence, positive_example, negative_example).
Checks if distance(sentence, positive_example) < distance(sentence, negative_example).
"""
def __init__(
self,
anchors: List[str],
positives: List[str],
negatives: List[str],
main_distance_function: SimilarityFunction = None,
name: str = "",
batch_size: int = 16,
show_progress_bar: bool = False,
write_csv: bool = True,
):
"""
:param anchors: Sentences to check similarity to (e.g., a query)
:param positives: List of positive sentences
:param negatives: List of negative sentences
:param main_distance_function: One of 0 (Cosine), 1 (Euclidean) or 2 (Manhattan). Defaults to None, in which case the best of the three accuracies is returned.
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If True, prints a progress bar
:param write_csv: Write results to a CSV file
"""
self.anchors = anchors
self.positives = positives
self.negatives = negatives
self.name = name
assert len(self.anchors) == len(self.positives)
assert len(self.anchors) == len(self.negatives)
self.main_distance_function = main_distance_function
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file: str = "triplet_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhattan", "accuracy_euclidean"]
self.write_csv = write_csv
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
anchors = []
positives = []
negatives = []
for example in examples:
anchors.append(example.texts[0])
positives.append(example.texts[1])
negatives.append(example.texts[2])
return cls(anchors, positives, negatives, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("TripletEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
num_triplets = 0
num_correct_cos_triplets, num_correct_manhattan_triplets, num_correct_euclidean_triplets = 0, 0, 0
embeddings_anchors = model.encode(
self.anchors, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
embeddings_positives = model.encode(
self.positives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
embeddings_negatives = model.encode(
self.negatives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
# Cosine distance
pos_cos_distance = paired_cosine_distances(embeddings_anchors, embeddings_positives)
neg_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_negatives)
# Manhattan
pos_manhattan_distance = paired_manhattan_distances(embeddings_anchors, embeddings_positives)
neg_manhattan_distances = paired_manhattan_distances(embeddings_anchors, embeddings_negatives)
# Euclidean
pos_euclidean_distance = paired_euclidean_distances(embeddings_anchors, embeddings_positives)
neg_euclidean_distances = paired_euclidean_distances(embeddings_anchors, embeddings_negatives)
for idx in range(len(pos_cos_distance)):
num_triplets += 1
if pos_cos_distance[idx] < neg_cos_distances[idx]:
num_correct_cos_triplets += 1
if pos_manhattan_distance[idx] < neg_manhattan_distances[idx]:
num_correct_manhattan_triplets += 1
if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]:
num_correct_euclidean_triplets += 1
accuracy_cos = num_correct_cos_triplets / num_triplets
accuracy_manhattan = num_correct_manhattan_triplets / num_triplets
accuracy_euclidean = num_correct_euclidean_triplets / num_triplets
logger.info("Accuracy Cosine Distance: \t{:.2f}".format(accuracy_cos * 100))
logger.info("Accuracy Manhattan Distance:\t{:.2f}".format(accuracy_manhattan * 100))
logger.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(accuracy_euclidean * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])
if self.main_distance_function == SimilarityFunction.COSINE:
return accuracy_cos
if self.main_distance_function == SimilarityFunction.MANHATTAN:
return accuracy_manhattan
if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
return accuracy_euclidean
return max(accuracy_cos, accuracy_manhattan, accuracy_euclidean)
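# Minimal usage sketch: build the evaluator from InputExample triplets
# (anchor, positive, negative) via from_input_examples() above. The sentences
# and the "all-MiniLM-L6-v2" checkpoint are placeholders.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    examples = [
        InputExample(texts=["A man is eating food.", "A man is eating a meal.", "A man rides a horse."]),
        InputExample(texts=["A woman plays violin.", "A woman plays music.", "A dog is barking."]),
    ]
    model = SentenceTransformer("all-MiniLM-L6-v2")
    evaluator = TripletEvaluator.from_input_examples(examples, name="toy-triplets", batch_size=2)
    # With main_distance_function=None, the best of the three accuracies is returned
    print(evaluator(model, output_path="."))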
from .SentenceEvaluator import SentenceEvaluator
from .SimilarityFunction import SimilarityFunction
from .BinaryClassificationEvaluator import BinaryClassificationEvaluator
from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator
from .InformationRetrievalEvaluator import InformationRetrievalEvaluator
from .LabelAccuracyEvaluator import LabelAccuracyEvaluator
from .MSEEvaluator import MSEEvaluator
from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame
from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator
from .SequentialEvaluator import SequentialEvaluator
from .TranslationEvaluator import TranslationEvaluator
from .TripletEvaluator import TripletEvaluator
from .RerankingEvaluator import RerankingEvaluator
__all__ = [
"SentenceEvaluator",
"SimilarityFunction",
"BinaryClassificationEvaluator",
"EmbeddingSimilarityEvaluator",
"InformationRetrievalEvaluator",
"LabelAccuracyEvaluator",
"MSEEvaluator",
"MSEEvaluatorFromDataFrame",
"ParaphraseMiningEvaluator",
"SequentialEvaluator",
"TranslationEvaluator",
"TripletEvaluator",
"RerankingEvaluator",
]
import random
from typing import Any, Dict, Iterable, List, Tuple
import warnings
from torch import Tensor, nn
from torch.nn import functional as F
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import CachedMultipleNegativesRankingLoss
from sentence_transformers.models import Transformer
class TransformerDecorator:
"""
Decorator that caches the embeddings of all layers of the transformer.
When `layer_idx` is set, it returns the cached embeddings of that layer instead.
This is meant to override the forward function of the Transformer.
"""
def __init__(self, transformer: Transformer, original_forward):
self.transformer = transformer
self.original_forward = original_forward
self.embeddings: List[Tuple[Tensor]] = []
self.last_embeddings: List[Tensor] = []
self.features: List[Dict[str, Tensor]] = []
self.layer_idx = None
self.call_idx = 0
def set_layer_idx(self, layer_idx):
self.layer_idx = layer_idx
self.call_idx = 0
def get_layer_embeddings(self):
return torch.concat([embedding[self.layer_idx] for embedding in self.embeddings], dim=1)
def __call__(self, features):
if self.layer_idx is None:
output = self.call_grow_cache(features)
else:
output = self.call_use_cache(features)
self.call_idx += 1
return output
def call_grow_cache(self, features):
"""
Temporarily sets the output_hidden_states to True, runs the model, and then restores the original setting.
The cached `all_layer_embeddings` output can then be used to retrieve the embeddings of every layer.
"""
original_output_hidden_states = self.transformer.auto_model.config.output_hidden_states
self.transformer.auto_model.config.output_hidden_states = True
output = self.original_forward(features)
# We ignore the first layer, as it is the input embeddings
# and the last layer, as we already computed the loss over it
self.num_layers = len(output["all_layer_embeddings"]) - 1
self.embeddings.append(output["all_layer_embeddings"][1:-1])
self.last_embeddings.append(output["token_embeddings"])
self.features.append(
{key: value for key, value in output.items() if key not in ["all_layer_embeddings", "token_embeddings"]}
)
# Restore original setting
self.transformer.auto_model.config.output_hidden_states = original_output_hidden_states
if original_output_hidden_states:
del output["all_layer_embeddings"]
return output
def call_use_cache(self, features):
return {**self.features[self.call_idx], "token_embeddings": self.embeddings[self.call_idx][self.layer_idx]}
class ForwardDecorator:
"""
Decorator that caches the embeddings after all modules (e.g. pooling) of the model.
Required to get the embeddings after all modules for the KL-divergence loss.
This is meant to override the forward function of the SentenceTransformer.
"""
def __init__(self, fn):
self.fn = fn
self.embeddings = []
def __call__(self, features):
output = self.fn(features)
self.embeddings.append(output["sentence_embedding"])
return output
def get_embeddings(self):
embeddings = torch.concat(self.embeddings, dim=0)
self.embeddings = []
return embeddings
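# Standalone sketch of the pattern used by both decorators above: a module's
# `forward` attribute is temporarily replaced by a callable that records the
# output before returning it, and is restored afterwards. The Linear layer and
# names below are illustrative only.
if __name__ == "__main__":
    import torch
    from torch import nn

    layer = nn.Linear(4, 2)
    original = layer.forward
    cached_outputs = []

    def caching_forward(x):
        out = original(x)
        cached_outputs.append(out.detach())  # record the output before handing it back
        return out

    layer.forward = caching_forward          # swap in the decorator
    layer(torch.randn(3, 4))                 # nn.Module.__call__ picks up the instance attribute
    layer.forward = original                 # restore, as AdaptiveLayerLoss.forward does below
    print(len(cached_outputs), cached_outputs[0].shape)  # -> 1 torch.Size([3, 2])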
class AdaptiveLayerLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
loss: nn.Module,
n_layers_per_step: int = 1,
last_layer_weight: float = 1.0,
prior_layers_weight: float = 1.0,
kl_div_weight: float = 1.0,
kl_temperature: float = 0.3,
) -> None:
"""
The AdaptiveLayerLoss can be seen as a loss *modifier* that allows you to use other loss functions at non-final
layers of the Sentence Transformer model. This is useful when you want to train a model where users have
the option to lower the number of layers used in order to improve inference speed and memory usage.
:param model: SentenceTransformer model
:param loss: The loss function to be used, e.g. :class:`MultipleNegativesRankingLoss`, :class:`CoSENTLoss`, etc.
:param n_layers_per_step: The number of layers to use per step. If -1, then all layers are used. If > 0, then
a random sample of `n_layers_per_step` layers is used per step, separate from the final layer, which is
always used. The 2DMSE paper uses `n_layers_per_step=1`. The default value is 1.
:param last_layer_weight: The weight to use for the loss of the final layer. Increase this to focus more on the
performance when using all layers. The default value is 1.0.
:param prior_layers_weight: The weight to use for the loss of the prior layers. Increase this to focus more on
the performance when using fewer layers. The default value is 1.0.
:param kl_div_weight: The weight to use for the KL-divergence loss that is used to make the prior layers match
that of the last layer. Increase this to focus more on the performance when using fewer layers. The default
value is 1.0.
:param kl_temperature: The temperature to use for the KL-divergence loss. If 0, then the KL-divergence loss is
not used. The default value is 0.3.
References:
- The concept was inspired by the 2DMSE paper: https://arxiv.org/abs/2402.14776
- `Adaptive Layers <../../examples/training/adaptive_layer/README.html>`_
Requirements:
1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss`.
Relations:
- :class:`Matryoshka2dLoss` uses this loss in combination with :class:`MatryoshkaLoss` which allows for
output dimensionality reduction for faster downstream tasks (e.g. retrieval).
Input:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| any | any |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('microsoft/mpnet-base')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_loss = losses.AdaptiveLayerLoss(model, train_loss)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super().__init__()
self.model = model
self.loss = loss
self.n_layers_per_step = n_layers_per_step
self.last_layer_weight = last_layer_weight
self.prior_layers_weight = prior_layers_weight
self.kl_div_weight = kl_div_weight
self.kl_temperature = kl_temperature
assert isinstance(self.model[0], Transformer)
if isinstance(loss, CachedMultipleNegativesRankingLoss):
warnings.warn("MatryoshkaLoss is not compatible with CachedMultipleNegativesRankingLoss.", stacklevel=2)
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor:
# Decorate the forward function of the transformer to cache the embeddings of all layers
original_transformer_forward = self.model[0].forward
transformer_decorator = TransformerDecorator(self.model[0], original_transformer_forward)
self.model[0].forward = transformer_decorator
# Decorate the forward function of the model to get the embeddings after all modules (e.g. pooling)
original_forward = self.model.forward
forward_decorator = ForwardDecorator(original_forward)
self.model.forward = forward_decorator
# Run the loss normally: i.e. the final layer, but 1) use the transformers decorator to cache
# the embeddings of all layers and 2) use the forward decorator to get the embeddings after all modules
# for the KL-divergence loss
loss = self.loss(sentence_features, labels) * self.last_layer_weight
if self.kl_temperature > 0:
final_embeddings = forward_decorator.get_embeddings()
final_embeddings = F.softmax(final_embeddings / self.kl_temperature, dim=-1)
num_layers = transformer_decorator.num_layers
layer_indices = range(num_layers - 1)
if self.n_layers_per_step > 0 and self.n_layers_per_step < num_layers - 1:
layer_indices = random.sample(layer_indices, self.n_layers_per_step)
# This loop is over `num_layer - 1` layers because we already computed the loss over the final layer
for layer_idx in layer_indices:
# Add regular loss for each layer by using the cached embeddings of that layer
transformer_decorator.set_layer_idx(layer_idx)
layer_loss = self.loss(sentence_features, labels)
loss = loss + layer_loss / (1 + layer_idx) / len(layer_indices) * self.prior_layers_weight
# and KL-divergence loss between the current layer and the final layer
# Note: we use "batchmean" reduction as that aligns with the mathematical definition
if self.kl_temperature > 0:
embeddings = forward_decorator.get_embeddings()
kl_div_loss = F.kl_div(
F.log_softmax(embeddings / self.kl_temperature, dim=-1),
final_embeddings,
reduction="batchmean",
)
loss = loss + kl_div_loss * self.kl_temperature * self.kl_div_weight
self.model[0].forward = original_transformer_forward
self.model.forward = original_forward
return loss
def get_config_dict(self) -> Dict[str, Any]:
return {
"loss": self.loss.__class__.__name__,
"n_layers_per_step": self.n_layers_per_step,
"last_layer_weight": self.last_layer_weight,
"prior_layers_weight": self.prior_layers_weight,
"kl_div_weight": self.kl_div_weight,
"kl_temperature": self.kl_temperature,
}
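# Worked sketch of the per-layer weighting applied in forward() above: each
# sampled prior layer contributes
#     layer_loss / (1 + layer_idx) / len(layer_indices) * prior_layers_weight,
# so shallower layers receive larger weights. The numbers below assume a
# 12-layer transformer with n_layers_per_step=-1 (all prior layers used).
if __name__ == "__main__":
    num_layers = 12                               # layers counted after dropping the input embeddings
    layer_indices = list(range(num_layers - 1))   # the final layer is handled separately
    prior_layers_weight = 1.0
    for idx in layer_indices:
        weight = prior_layers_weight / (1 + idx) / len(layer_indices)
        print(f"layer {idx:2d}: weight {weight:.4f}")  # layer 0 -> ~0.0909, layer 10 -> ~0.0083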