import random
import math
class NoDuplicatesDataLoader:
def __init__(self, train_examples, batch_size):
"""
A special data loader to be used with MultipleNegativesRankingLoss.
The data loader ensures that there are no duplicate sentences within the same batch
"""
self.batch_size = batch_size
self.data_pointer = 0
self.collate_fn = None
self.train_examples = train_examples
random.shuffle(self.train_examples)
def __iter__(self):
for _ in range(self.__len__()):
batch = []
texts_in_batch = set()
while len(batch) < self.batch_size:
example = self.train_examples[self.data_pointer]
valid_example = True
for text in example.texts:
if text.strip().lower() in texts_in_batch:
valid_example = False
break
if valid_example:
batch.append(example)
for text in example.texts:
texts_in_batch.add(text.strip().lower())
self.data_pointer += 1
if self.data_pointer >= len(self.train_examples):
self.data_pointer = 0
random.shuffle(self.train_examples)
yield self.collate_fn(batch) if self.collate_fn is not None else batch
def __len__(self):
return math.floor(len(self.train_examples) / self.batch_size)
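# Usage sketch (editor's illustration, not part of the original module): training with
# MultipleNegativesRankingLoss, where duplicate in-batch sentences would create false
# negatives. The model name and example texts are assumptions chosen for illustration.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")  # any SentenceTransformer checkpoint
    train_examples = [
        InputExample(texts=["A man is eating food.", "A man is eating a meal."]),
        InputExample(texts=["A dog runs outside.", "The dog is running in the yard."]),
    ]
    # model.fit() assigns model.smart_batching_collate as the collate_fn of this loader.
    train_dataloader = NoDuplicatesDataLoader(train_examples, batch_size=2)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)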
from torch.utils.data import Dataset
import logging
import gzip
from .. import SentenceTransformer
from ..readers import InputExample
from typing import List
import random
logger = logging.getLogger(__name__)
class ParallelSentencesDataset(Dataset):
"""
This dataset reader can be used to read in parallel sentences, i.e., it reads a file with tab-separated sentences where the same
sentence is given in different languages. For example, the file can look like this (EN\tDE\tES):
hello world hallo welt hola mundo
second sentence zweiter satz segunda oración
The sentence in the first column will be mapped to a sentence embedding using the given embedder. For example, the
embedder can be a monolingual sentence embedding method for English. The sentences in the other languages will also be
mapped to this English sentence embedding.
When getting a sample from the dataset, we get one sentence with the corresponding sentence embedding for this sentence.
teacher_model can be any class that implements an encode function. The encode function gets a list of sentences and
returns a list of sentence embeddings.
"""
def __init__(
self,
student_model: SentenceTransformer,
teacher_model: SentenceTransformer,
batch_size: int = 8,
use_embedding_cache: bool = True,
):
"""
Parallel sentences dataset reader to train student model given a teacher model
:param student_model: Student sentence embedding model that should be trained
:param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file
"""
self.student_model = student_model
self.teacher_model = teacher_model
self.datasets = []
self.datasets_iterator = []
self.datasets_tokenized = []
self.dataset_indices = []
self.copy_dataset_indices = []
self.cache = []
self.batch_size = batch_size
self.use_embedding_cache = use_embedding_cache
self.embedding_cache = {}
self.num_sentences = 0
def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128):
"""
Reads in a tab-separated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column
:param filepath: Filepath to the file
:param weight: If more than one dataset is loaded with load_data, this weight determines how frequently data is sampled from this dataset relative to the others
:param max_sentences: Max number of lines to be read from filepath
:param max_sentence_length: Skip the example if one of the sentences has more than max_sentence_length characters
:return:
"""
logger.info("Load " + filepath)
parallel_sentences = []
with gzip.open(filepath, "rt", encoding="utf8") if filepath.endswith(".gz") else open(
filepath, encoding="utf8"
) as fIn:
count = 0
for line in fIn:
sentences = line.strip().split("\t")
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
parallel_sentences.append(sentences)
count += 1
if max_sentences is not None and max_sentences > 0 and count >= max_sentences:
break
self.add_dataset(
parallel_sentences, weight=weight, max_sentences=max_sentences, max_sentence_length=max_sentence_length
)
def add_dataset(
self,
parallel_sentences: List[List[str]],
weight: int = 100,
max_sentences: int = None,
max_sentence_length: int = 128,
):
sentences_map = {}
for sentences in parallel_sentences:
if (
max_sentence_length is not None
and max_sentence_length > 0
and max([len(sent) for sent in sentences]) > max_sentence_length
):
continue
source_sentence = sentences[0]
if source_sentence not in sentences_map:
sentences_map[source_sentence] = set()
for sent in sentences:
sentences_map[source_sentence].add(sent)
if max_sentences is not None and max_sentences > 0 and len(sentences_map) >= max_sentences:
break
if len(sentences_map) == 0:
return
self.num_sentences += sum([len(sentences_map[sent]) for sent in sentences_map])
dataset_id = len(self.datasets)
self.datasets.append(list(sentences_map.items()))
self.datasets_iterator.append(0)
self.dataset_indices.extend([dataset_id] * weight)
def generate_data(self):
source_sentences_list = []
target_sentences_list = []
for data_idx in self.dataset_indices:
src_sentence, trg_sentences = self.next_entry(data_idx)
source_sentences_list.append(src_sentence)
target_sentences_list.append(trg_sentences)
# Generate embeddings
src_embeddings = self.get_embeddings(source_sentences_list)
for src_embedding, trg_sentences in zip(src_embeddings, target_sentences_list):
for trg_sentence in trg_sentences:
self.cache.append(InputExample(texts=[trg_sentence], label=src_embedding))
random.shuffle(self.cache)
def next_entry(self, data_idx):
source, target_sentences = self.datasets[data_idx][self.datasets_iterator[data_idx]]
self.datasets_iterator[data_idx] += 1
if self.datasets_iterator[data_idx] >= len(self.datasets[data_idx]): # Restart iterator
self.datasets_iterator[data_idx] = 0
random.shuffle(self.datasets[data_idx])
return source, target_sentences
def get_embeddings(self, sentences):
if not self.use_embedding_cache:
return self.teacher_model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
# Use caching
new_sentences = []
for sent in sentences:
if sent not in self.embedding_cache:
new_sentences.append(sent)
if len(new_sentences) > 0:
new_embeddings = self.teacher_model.encode(
new_sentences, batch_size=self.batch_size, show_progress_bar=False, convert_to_numpy=True
)
for sent, embedding in zip(new_sentences, new_embeddings):
self.embedding_cache[sent] = embedding
return [self.embedding_cache[sent] for sent in sentences]
def __len__(self):
return self.num_sentences
def __getitem__(self, idx):
if len(self.cache) == 0:
self.generate_data()
return self.cache.pop()
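# Usage sketch (editor's illustration, not part of the original module): multilingual
# knowledge distillation as in https://arxiv.org/abs/2004.09813. The model names and the
# file path "parallel-sentences.tsv.gz" are assumptions for illustration only.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, losses

    teacher = SentenceTransformer("paraphrase-distilroberta-base-v2")  # English teacher
    student = SentenceTransformer("xlm-roberta-base")                  # multilingual student
    train_data = ParallelSentencesDataset(student_model=student, teacher_model=teacher)
    train_data.load_data("parallel-sentences.tsv.gz")  # TSV: source sentence + translations
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
    # MSELoss regresses the student embedding onto the teacher embedding stored as label.
    train_loss = losses.MSELoss(model=student)
    student.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)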
""" """
from torch.utils.data import IterableDataset
import numpy as np
from typing import List
from ..readers import InputExample
import logging
logger = logging.getLogger(__name__)
class SentenceLabelDataset(IterableDataset):
"""
This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires
multiple examples with the same label in a batch.
It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label.
Labels with fewer than n unique samples are ignored.
This also applies to drawing without replacement: once fewer than n samples remain for a label, it is skipped.
This *DOES NOT* check whether there are at least as many labels as the batch size, or whether the batch size is
divisible by the number of samples drawn per label.
"""
def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False):
"""
Creates a LabelSampler for a SentenceLabelDataset.
:param examples:
a list with InputExamples
:param samples_per_label:
the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label
:param with_replacement:
if this is False, then each sample is drawn at most once (depending on the total number of samples per label).
if this is True, then one sample can be drawn in multiple draws, but still not multiple times in the same
drawing.
"""
super().__init__()
self.samples_per_label = samples_per_label
# Group examples by label
label2ex = {}
for example in examples:
if example.label not in label2ex:
label2ex[example.label] = []
label2ex[example.label].append(example)
# Include only labels with at least samples_per_label examples
self.grouped_inputs = []
self.groups_right_border = []
num_labels = 0
for label, label_examples in label2ex.items():
if len(label_examples) >= self.samples_per_label:
self.grouped_inputs.extend(label_examples)
self.groups_right_border.append(
len(self.grouped_inputs)
) # At which position does this label group / bucket end?
num_labels += 1
self.label_range = np.arange(num_labels)
self.with_replacement = with_replacement
np.random.shuffle(self.label_range)
logger.info(
"SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(
len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels
)
)
def __iter__(self):
label_idx = 0
count = 0
already_seen = {}
while count < len(self.grouped_inputs):
label = self.label_range[label_idx]
if label not in already_seen:
already_seen[label] = set()
left_border = 0 if label == 0 else self.groups_right_border[label - 1]
right_border = self.groups_right_border[label]
if self.with_replacement:
selection = np.arange(left_border, right_border)
else:
selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]]
if len(selection) >= self.samples_per_label:
for element_idx in np.random.choice(selection, self.samples_per_label, replace=False):
count += 1
already_seen[label].add(element_idx)
yield self.grouped_inputs[element_idx]
label_idx += 1
if label_idx >= len(self.label_range):
label_idx = 0
already_seen = {}
np.random.shuffle(self.label_range)
def __len__(self):
return len(self.grouped_inputs)
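# Usage sketch (editor's illustration, not part of the original module): batch-hard triplet
# training, which needs several examples per label in every batch. Model name and toy
# examples are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    examples = [
        InputExample(texts=["great movie"], label=1),
        InputExample(texts=["loved the film"], label=1),
        InputExample(texts=["terrible movie"], label=0),
        InputExample(texts=["awful film"], label=0),
    ]
    train_data = SentenceLabelDataset(examples, samples_per_label=2)
    # Batch size should be a multiple of samples_per_label; drop_last avoids ragged batches.
    train_dataloader = DataLoader(train_data, batch_size=4, drop_last=True)
    train_loss = losses.BatchHardTripletLoss(model=model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)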
from torch.utils.data import Dataset
from typing import List
from .. import SentenceTransformer
from ..readers.InputExample import InputExample
class SentencesDataset(Dataset):
"""
DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset
and then passing it to the DataLoader, you can pass the list of InputExamples directly to the DataLoader.
"""
def __init__(self, examples: List[InputExample], model: SentenceTransformer):
self.examples = examples
def __getitem__(self, item):
return self.examples[item]
def __len__(self):
return len(self.examples)
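# Usage sketch (editor's illustration, not part of the original module): as the deprecation
# note above says, the list of InputExamples can go straight into a DataLoader. Model name,
# sentences and labels are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    train_examples = [
        InputExample(texts=["First sentence", "Second sentence"], label=0.8),
        InputExample(texts=["Another pair", "Yet another sentence"], label=0.3),
    ]
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
    train_loss = losses.CosineSimilarityLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)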
from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset
from .NoDuplicatesDataLoader import NoDuplicatesDataLoader
from .ParallelSentencesDataset import ParallelSentencesDataset
from .SentencesDataset import SentencesDataset
from .SentenceLabelDataset import SentenceLabelDataset
__all__ = [
"DenoisingAutoEncoderDataset",
"NoDuplicatesDataLoader",
"ParallelSentencesDataset",
"SentencesDataset",
"SentenceLabelDataset",
]
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class BinaryClassificationEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and
dissimilar sentences.
The metrics are computed for cosine similarity, dot-product similarity, Euclidean distance and Manhattan distance.
The returned score is the maximum average precision across these metrics.
The results are written in a CSV. If a CSV already exists, then values are appended.
The labels need to be 0 for dissimilar pairs and 1 for similar pairs.
:param sentences1: The first column of sentences
:param sentences2: The second column of sentences
:param labels: labels[i] is the label for the pair (sentences1[i], sentences2[i]). Must be 0 or 1
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
labels: List[int],
name: str = "",
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
for label in labels:
assert label == 0 or label == 1
self.write_csv = write_csv
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = "binary_classification_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = [
"epoch",
"steps",
"cossim_accuracy",
"cossim_accuracy_threshold",
"cossim_f1",
"cossim_precision",
"cossim_recall",
"cossim_f1_threshold",
"cossim_ap",
"manhattan_accuracy",
"manhattan_accuracy_threshold",
"manhattan_f1",
"manhattan_precision",
"manhattan_recall",
"manhattan_f1_threshold",
"manhattan_ap",
"euclidean_accuracy",
"euclidean_accuracy_threshold",
"euclidean_f1",
"euclidean_precision",
"euclidean_recall",
"euclidean_f1_threshold",
"euclidean_ap",
"dot_accuracy",
"dot_accuracy_threshold",
"dot_f1",
"dot_precision",
"dot_recall",
"dot_f1_threshold",
"dot_ap",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
# Main score is the max of Average Precision (AP)
main_score = max(scores[short_name]["ap"] for short_name in scores)
file_output_data = [epoch, steps]
for header_name in self.csv_headers:
if "_" in header_name:
sim_fct, metric = header_name.split("_", maxsplit=1)
file_output_data.append(scores[sim_fct][metric])
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow(file_output_data)
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(file_output_data)
return main_score
def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)
embeddings1_np = np.asarray(embeddings1)
embeddings2_np = np.asarray(embeddings2)
dot_scores = [np.dot(embeddings1_np[i], embeddings2_np[i]) for i in range(len(embeddings1_np))]
labels = np.asarray(self.labels)
output_scores = {}
for short_name, name, scores, reverse in [
["cossim", "Cosine-Similarity", cosine_scores, True],
["manhattan", "Manhattan-Distance", manhattan_distances, False],
["euclidean", "Euclidean-Distance", euclidean_distances, False],
["dot", "Dot-Product", dot_scores, True],
]:
acc, acc_threshold = self.find_best_acc_and_threshold(scores, labels, reverse)
f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, reverse)
ap = average_precision_score(labels, scores * (1 if reverse else -1))
logger.info(
"Accuracy with {}: {:.2f}\t(Threshold: {:.4f})".format(name, acc * 100, acc_threshold)
)
logger.info("F1 with {}: {:.2f}\t(Threshold: {:.4f})".format(name, f1 * 100, f1_threshold))
logger.info("Precision with {}: {:.2f}".format(name, precision * 100))
logger.info("Recall with {}: {:.2f}".format(name, recall * 100))
logger.info("Average Precision with {}: {:.2f}\n".format(name, ap * 100))
output_scores[short_name] = {
"accuracy": acc,
"accuracy_threshold": acc_threshold,
"f1": f1,
"f1_threshold": f1_threshold,
"precision": precision,
"recall": recall,
"ap": ap,
}
return output_scores
@staticmethod
def find_best_acc_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
max_acc = 0
best_threshold = -1
positive_so_far = 0
remaining_negatives = sum(labels == 0)
for i in range(len(rows) - 1):
score, label = rows[i]
if label == 1:
positive_so_far += 1
else:
remaining_negatives -= 1
acc = (positive_so_far + remaining_negatives) / len(labels)
if acc > max_acc:
max_acc = acc
best_threshold = (rows[i][0] + rows[i + 1][0]) / 2
return max_acc, best_threshold
@staticmethod
def find_best_f1_and_threshold(scores, labels, high_score_more_similar: bool):
assert len(scores) == len(labels)
scores = np.asarray(scores)
labels = np.asarray(labels)
rows = list(zip(scores, labels))
rows = sorted(rows, key=lambda x: x[0], reverse=high_score_more_similar)
best_f1 = best_precision = best_recall = 0
threshold = 0
nextract = 0
ncorrect = 0
total_num_duplicates = sum(labels)
for i in range(len(rows) - 1):
score, label = rows[i]
nextract += 1
if label == 1:
ncorrect += 1
if ncorrect > 0:
precision = ncorrect / nextract
recall = ncorrect / total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (rows[i][0] + rows[i + 1][0]) / 2
return best_f1, best_precision, best_recall, threshold
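# Usage sketch (editor's illustration, not part of the original module): evaluating whether
# sentence pairs are duplicates (label 1) or not (label 0). The model name and the pairs
# below are assumptions.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    evaluator = BinaryClassificationEvaluator(
        sentences1=["How do I learn Python?", "What is the capital of France?"],
        sentences2=["What is the best way to learn Python?", "Who wrote Hamlet?"],
        labels=[1, 0],
        name="toy-duplicates",
    )
    # Returns the best average precision across the similarity/distance metrics.
    score = evaluator(model, output_path=".")
    print(score)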
from contextlib import nullcontext
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
import numpy as np
from typing import List, Literal, Optional
from ..readers import InputExample
logger = logging.getLogger(__name__)
class EmbeddingSimilarityEvaluator(SentenceEvaluator):
"""
Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation
in comparison to the gold standard labels.
The metrics are computed for cosine similarity, dot-product similarity, Euclidean distance and Manhattan distance.
The returned score is the Spearman correlation for the specified main_similarity metric (or the best one if none is given).
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(
self,
sentences1: List[str],
sentences2: List[str],
scores: List[float],
batch_size: int = 16,
main_similarity: SimilarityFunction = None,
name: str = "",
show_progress_bar: bool = False,
write_csv: bool = True,
precision: Optional[Literal["float32", "int8", "uint8", "binary", "ubinary"]] = None,
truncate_dim: Optional[int] = None,
):
"""
Constructs an evaluator for the given dataset.
The labels need to indicate the similarity between the sentences.
:param sentences1: List with the first sentence in a pair
:param sentences2: List with the second sentence in a pair
:param scores: Similarity score between sentences1[i] and sentences2[i]
:param write_csv: Write results to a CSV file
:param precision: The precision to use for the embeddings. Can be "float32", "int8", "uint8", "binary", or
"ubinary". Defaults to None.
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current
truncation dimension. Defaults to None.
"""
self.sentences1 = sentences1
self.sentences2 = sentences2
self.scores = scores
self.write_csv = write_csv
self.precision = precision
self.truncate_dim = truncate_dim
assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.scores)
self.main_similarity = main_similarity
self.name = name
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file = (
"similarity_evaluation"
+ ("_" + name if name else "")
+ ("_" + precision if precision else "")
+ "_results.csv"
)
self.csv_headers = [
"epoch",
"steps",
"cosine_pearson",
"cosine_spearman",
"euclidean_pearson",
"euclidean_spearman",
"manhattan_pearson",
"manhattan_spearman",
"dot_pearson",
"dot_spearman",
]
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
sentences1 = []
sentences2 = []
scores = []
for example in examples:
sentences1.append(example.texts[0])
sentences2.append(example.texts[1])
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
self.sentences1,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
embeddings2 = model.encode(
self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
precision=self.precision,
normalize_embeddings=bool(self.precision),
)
# Binary and ubinary embeddings are packed, so we need to unpack them for the distance metrics
if self.precision == "binary":
embeddings1 = (embeddings1 + 128).astype(np.uint8)
embeddings2 = (embeddings2 + 128).astype(np.uint8)
if self.precision in ("ubinary", "binary"):
embeddings1 = np.unpackbits(embeddings1, axis=1)
embeddings2 = np.unpackbits(embeddings2, axis=1)
labels = self.scores
cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
eval_pearson_dot, _ = pearsonr(labels, dot_products)
eval_spearman_dot, _ = spearmanr(labels, dot_products)
logger.info(
"Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_cosine, eval_spearman_cosine)
)
logger.info(
"Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_manhattan, eval_spearman_manhattan
)
)
logger.info(
"Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format(
eval_pearson_euclidean, eval_spearman_euclidean
)
)
logger.info(
"Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson_dot, eval_spearman_dot)
)
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow(
[
epoch,
steps,
eval_pearson_cosine,
eval_spearman_cosine,
eval_pearson_euclidean,
eval_spearman_euclidean,
eval_pearson_manhattan,
eval_spearman_manhattan,
eval_pearson_dot,
eval_spearman_dot,
]
)
if self.main_similarity == SimilarityFunction.COSINE:
return eval_spearman_cosine
elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
return eval_spearman_euclidean
elif self.main_similarity == SimilarityFunction.MANHATTAN:
return eval_spearman_manhattan
elif self.main_similarity == SimilarityFunction.DOT_PRODUCT:
return eval_spearman_dot
elif self.main_similarity is None:
return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
else:
raise ValueError("Unknown main_similarity value")
from . import SentenceEvaluator
import torch
from torch import Tensor
import logging
from tqdm import trange
from ..util import cos_sim, dot_score
import os
import numpy as np
from typing import List, Dict, Set, Callable
import heapq
logger = logging.getLogger(__name__)
class InformationRetrievalEvaluator(SentenceEvaluator):
"""
This class evaluates an Information Retrieval (IR) setting.
Given a set of queries and a large corpus, it retrieves for each query the top-k most similar documents. It measures
Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG).
"""
def __init__(
self,
queries: Dict[str, str], # qid => query
corpus: Dict[str, str], # cid => doc
relevant_docs: Dict[str, Set[str]], # qid => Set[cid]
corpus_chunk_size: int = 50000,
mrr_at_k: List[int] = [10],
ndcg_at_k: List[int] = [10],
accuracy_at_k: List[int] = [1, 3, 5, 10],
precision_recall_at_k: List[int] = [1, 3, 5, 10],
map_at_k: List[int] = [100],
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = {
"cos_sim": cos_sim,
"dot_score": dot_score,
}, # Score function, higher=more similar
main_score_function: str = None,
):
self.queries_ids = []
for qid in queries:
if qid in relevant_docs and len(relevant_docs[qid]) > 0:
self.queries_ids.append(qid)
self.queries = [queries[qid] for qid in self.queries_ids]
self.corpus_ids = list(corpus.keys())
self.corpus = [corpus[cid] for cid in self.corpus_ids]
self.relevant_docs = relevant_docs
self.corpus_chunk_size = corpus_chunk_size
self.mrr_at_k = mrr_at_k
self.ndcg_at_k = ndcg_at_k
self.accuracy_at_k = accuracy_at_k
self.precision_recall_at_k = precision_recall_at_k
self.map_at_k = map_at_k
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.write_csv = write_csv
self.score_functions = score_functions
self.score_function_names = sorted(list(self.score_functions.keys()))
self.main_score_function = main_score_function
if name:
name = "_" + name
self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
for score_name in self.score_function_names:
for k in accuracy_at_k:
self.csv_headers.append("{}-Accuracy@{}".format(score_name, k))
for k in precision_recall_at_k:
self.csv_headers.append("{}-Precision@{}".format(score_name, k))
self.csv_headers.append("{}-Recall@{}".format(score_name, k))
for k in mrr_at_k:
self.csv_headers.append("{}-MRR@{}".format(score_name, k))
for k in ndcg_at_k:
self.csv_headers.append("{}-NDCG@{}".format(score_name, k))
for k in map_at_k:
self.csv_headers.append("{}-MAP@{}".format(score_name, k))
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
if epoch != -1:
out_txt = (
" after epoch {}:".format(epoch)
if steps == -1
else " in epoch {} after {} steps:".format(epoch, steps)
)
else:
out_txt = ":"
logger.info("Information Retrieval Evaluation on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model, *args, **kwargs)
# Write results to disk
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
fOut = open(csv_path, mode="w", encoding="utf-8")
fOut.write(",".join(self.csv_headers))
fOut.write("\n")
else:
fOut = open(csv_path, mode="a", encoding="utf-8")
output_data = [epoch, steps]
for name in self.score_function_names:
for k in self.accuracy_at_k:
output_data.append(scores[name]["accuracy@k"][k])
for k in self.precision_recall_at_k:
output_data.append(scores[name]["precision@k"][k])
output_data.append(scores[name]["recall@k"][k])
for k in self.mrr_at_k:
output_data.append(scores[name]["mrr@k"][k])
for k in self.ndcg_at_k:
output_data.append(scores[name]["ndcg@k"][k])
for k in self.map_at_k:
output_data.append(scores[name]["map@k"][k])
fOut.write(",".join(map(str, output_data)))
fOut.write("\n")
fOut.close()
if self.main_score_function is None:
return max([scores[name]["map@k"][max(self.map_at_k)] for name in self.score_function_names])
else:
return scores[self.main_score_function]["map@k"][max(self.map_at_k)]
def compute_metrices(self, model, corpus_model=None, corpus_embeddings: Tensor = None) -> Dict[str, float]:
if corpus_model is None:
corpus_model = model
max_k = max(
max(self.mrr_at_k),
max(self.ndcg_at_k),
max(self.accuracy_at_k),
max(self.precision_recall_at_k),
max(self.map_at_k),
)
# Compute embedding for the queries
query_embeddings = model.encode(
self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True
)
queries_result_list = {}
for name in self.score_functions:
queries_result_list[name] = [[] for _ in range(len(query_embeddings))]
# Iterate over chunks of the corpus
for corpus_start_idx in trange(
0, len(self.corpus), self.corpus_chunk_size, desc="Corpus Chunks", disable=not self.show_progress_bar
):
corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))
# Encode chunk of corpus
if corpus_embeddings is None:
sub_corpus_embeddings = corpus_model.encode(
self.corpus[corpus_start_idx:corpus_end_idx],
show_progress_bar=False,
batch_size=self.batch_size,
convert_to_tensor=True,
)
else:
sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]
# Compute similarities with each score function
for name, score_function in self.score_functions.items():
pair_scores = score_function(query_embeddings, sub_corpus_embeddings)
# Get top-k values
pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(
pair_scores, min(max_k, len(pair_scores[0])), dim=1, largest=True, sorted=False
)
pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()
for query_itr in range(len(query_embeddings)):
for sub_corpus_id, score in zip(
pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]
):
corpus_id = self.corpus_ids[corpus_start_idx + sub_corpus_id]
if len(queries_result_list[name][query_itr]) < max_k:
heapq.heappush(
queries_result_list[name][query_itr], (score, corpus_id)
) # heapq orders entries by the first element of the tuple (the score)
else:
heapq.heappushpop(queries_result_list[name][query_itr], (score, corpus_id))
for name in queries_result_list:
for query_itr in range(len(queries_result_list[name])):
for doc_itr in range(len(queries_result_list[name][query_itr])):
score, corpus_id = queries_result_list[name][query_itr][doc_itr]
queries_result_list[name][query_itr][doc_itr] = {"corpus_id": corpus_id, "score": score}
logger.info("Queries: {}".format(len(self.queries)))
logger.info("Corpus: {}\n".format(len(self.corpus)))
# Compute scores
scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}
# Output
for name in self.score_function_names:
logger.info("Score-Function: {}".format(name))
self.output_scores(scores[name])
return scores
def compute_metrics(self, queries_result_list: List[object]):
# Init score computation values
num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
precisions_at_k = {k: [] for k in self.precision_recall_at_k}
recall_at_k = {k: [] for k in self.precision_recall_at_k}
MRR = {k: 0 for k in self.mrr_at_k}
ndcg = {k: [] for k in self.ndcg_at_k}
AveP_at_k = {k: [] for k in self.map_at_k}
# Compute scores on results
for query_itr in range(len(queries_result_list)):
query_id = self.queries_ids[query_itr]
# Sort scores
top_hits = sorted(queries_result_list[query_itr], key=lambda x: x["score"], reverse=True)
query_relevant_docs = self.relevant_docs[query_id]
# Accuracy@k - We count the result as correct if at least one relevant doc is among the top-k documents
for k_val in self.accuracy_at_k:
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_hits_at_k[k_val] += 1
break
# Precision and Recall@k
for k_val in self.precision_recall_at_k:
num_correct = 0
for hit in top_hits[0:k_val]:
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
precisions_at_k[k_val].append(num_correct / k_val)
recall_at_k[k_val].append(num_correct / len(query_relevant_docs))
# MRR@k
for k_val in self.mrr_at_k:
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
MRR[k_val] += 1.0 / (rank + 1)
break
# NDCG@k
for k_val in self.ndcg_at_k:
predicted_relevance = [
1 if top_hit["corpus_id"] in query_relevant_docs else 0 for top_hit in top_hits[0:k_val]
]
true_relevances = [1] * len(query_relevant_docs)
ndcg_value = self.compute_dcg_at_k(predicted_relevance, k_val) / self.compute_dcg_at_k(
true_relevances, k_val
)
ndcg[k_val].append(ndcg_value)
# MAP@k
for k_val in self.map_at_k:
num_correct = 0
sum_precisions = 0
for rank, hit in enumerate(top_hits[0:k_val]):
if hit["corpus_id"] in query_relevant_docs:
num_correct += 1
sum_precisions += num_correct / (rank + 1)
avg_precision = sum_precisions / min(k_val, len(query_relevant_docs))
AveP_at_k[k_val].append(avg_precision)
# Compute averages
for k in num_hits_at_k:
num_hits_at_k[k] /= len(self.queries)
for k in precisions_at_k:
precisions_at_k[k] = np.mean(precisions_at_k[k])
for k in recall_at_k:
recall_at_k[k] = np.mean(recall_at_k[k])
for k in ndcg:
ndcg[k] = np.mean(ndcg[k])
for k in MRR:
MRR[k] /= len(self.queries)
for k in AveP_at_k:
AveP_at_k[k] = np.mean(AveP_at_k[k])
return {
"accuracy@k": num_hits_at_k,
"precision@k": precisions_at_k,
"recall@k": recall_at_k,
"ndcg@k": ndcg,
"mrr@k": MRR,
"map@k": AveP_at_k,
}
def output_scores(self, scores):
for k in scores["accuracy@k"]:
logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))
for k in scores["precision@k"]:
logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))
for k in scores["recall@k"]:
logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))
for k in scores["mrr@k"]:
logger.info("MRR@{}: {:.4f}".format(k, scores["mrr@k"][k]))
for k in scores["ndcg@k"]:
logger.info("NDCG@{}: {:.4f}".format(k, scores["ndcg@k"][k]))
for k in scores["map@k"]:
logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))
@staticmethod
def compute_dcg_at_k(relevances, k):
dcg = 0
for i in range(min(len(relevances), k)):
dcg += relevances[i] / np.log2(i + 2) # +2 as we start our idx at 0
return dcg
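# Usage sketch (editor's illustration, not part of the original module): a tiny query/corpus
# setup. The IDs, texts and model name are assumptions; small at-k values keep the toy
# example meaningful.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    queries = {"q1": "What is the capital of France?"}
    corpus = {
        "d1": "Paris is the capital of France.",
        "d2": "Berlin is the capital of Germany.",
        "d3": "The Eiffel Tower is in Paris.",
    }
    relevant_docs = {"q1": {"d1"}}
    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs,
        accuracy_at_k=[1, 3], precision_recall_at_k=[1, 3],
        mrr_at_k=[3], ndcg_at_k=[3], map_at_k=[3],
        name="toy-ir",
    )
    # Returns the best MAP@k across the configured score functions (cos_sim, dot_score).
    print(evaluator(model, output_path="."))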
from . import SentenceEvaluator
import torch
from torch.utils.data import DataLoader
import logging
from ..util import batch_to_device
import os
import csv
logger = logging.getLogger(__name__)
class LabelAccuracyEvaluator(SentenceEvaluator):
"""
Evaluate a model based on its accuracy on a labeled dataset
This requires the corresponding SoftmaxLoss model, passed via the softmax_model parameter.
The results are written in a CSV. If a CSV already exists, then values are appended.
"""
def __init__(self, dataloader: DataLoader, name: str = "", softmax_model=None, write_csv: bool = True):
"""
Constructs an evaluator for the given dataset
:param dataloader:
the data for the evaluation
"""
self.dataloader = dataloader
self.name = name
self.softmax_model = softmax_model
if name:
name = "_" + name
self.write_csv = write_csv
self.csv_file = "accuracy_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy"]
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
model.eval()
total = 0
correct = 0
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluation on the " + self.name + " dataset" + out_txt)
self.dataloader.collate_fn = model.smart_batching_collate
for step, batch in enumerate(self.dataloader):
features, label_ids = batch
for idx in range(len(features)):
features[idx] = batch_to_device(features[idx], model.device)
label_ids = label_ids.to(model.device)
with torch.no_grad():
_, prediction = self.softmax_model(features, labels=None)
total += prediction.size(0)
correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
accuracy = correct / total
logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy])
return accuracy
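# Usage sketch (editor's illustration, not part of the original module): the evaluator needs
# the SoftmaxLoss head used during training, passed via softmax_model. Model name, labels
# and sentences are assumptions.
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    from sentence_transformers import SentenceTransformer, InputExample, losses

    model = SentenceTransformer("all-MiniLM-L6-v2")
    examples = [
        InputExample(texts=["A man is eating.", "A man eats food."], label=0),
        InputExample(texts=["A man is eating.", "The weather is nice."], label=1),
    ]
    softmax_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2,
    )
    softmax_loss.to(model.device)  # during model.fit() the loss is moved automatically
    eval_dataloader = DataLoader(examples, batch_size=2)
    evaluator = LabelAccuracyEvaluator(eval_dataloader, name="toy-nli", softmax_model=softmax_loss)
    print(evaluator(model, output_path="."))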
from sentence_transformers.evaluation import SentenceEvaluator
import logging
import os
import csv
from typing import List
logger = logging.getLogger(__name__)
class MSEEvaluator(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding
and some target sentence embedding.
The MSE is computed over teacher.encode(source_sentences) - student.encode(target_sentences), i.e. the mean squared difference between teacher and student embeddings.
For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English
and target_sentences are in a different language like German, Chinese, Spanish...
:param source_sentences: Source sentences are embedded with the teacher model
:param target_sentences: Target sentences are embedded with the student model.
:param show_progress_bar: Show progress bar when computing embeddings
:param batch_size: Batch size to compute sentence embeddings
:param name: Name of the evaluator
:param write_csv: Write results to CSV file
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
teacher_model=None,
show_progress_bar: bool = False,
batch_size: int = 32,
name: str = "",
write_csv: bool = True,
):
self.source_embeddings = teacher_model.encode(
source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True
)
self.target_sentences = target_sentences
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.name = name
self.csv_file = "mse_evaluation_" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "MSE"]
self.write_csv = write_csv
def __call__(self, model, output_path, epoch=-1, steps=-1):
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
target_embeddings = model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=True,
)
mse = ((self.source_embeddings - target_embeddings) ** 2).mean()
mse *= 100
logger.info("MSE evaluation (lower = better) on " + self.name + " dataset" + out_txt)
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mse])
return -mse # Return negative score as SentenceTransformers maximizes the performance
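# Usage sketch (editor's illustration, not part of the original module): measuring how closely
# a student model matches a teacher on translated sentences. The model names and the EN/DE
# pairs are assumptions; teacher and student must share the embedding dimension.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    teacher = SentenceTransformer("all-MiniLM-L6-v2")
    student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    source_sentences = ["Hello world", "How are you?"]
    target_sentences = ["Hallo Welt", "Wie geht es dir?"]
    evaluator = MSEEvaluator(source_sentences, target_sentences, teacher_model=teacher, name="en-de")
    # Returns the negated MSE*100, since higher evaluator scores are treated as better.
    print(evaluator(student, output_path="."))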
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers import SentenceTransformer
from typing import List, Tuple, Dict
import numpy as np
import logging
import os
import csv
logger = logging.getLogger(__name__)
class MSEEvaluatorFromDataFrame(SentenceEvaluator):
"""
Computes the mean squared error (x100) between the computed sentence embedding and some target sentence embedding.
:param dataframe: It must have the following format. Rows contain different, parallel sentences.
Columns are the respective language codes::
[{'en': 'My sentence', 'es': 'Sentence in Spanish', 'fr': 'Sentence in French'...},
{'en': 'My second sentence', ...}]
:param combinations: Must be of the format ``[('en', 'es'), ('en', 'fr'), ...]``.
The first entry in a tuple is the source language; the sentence in that language will be fetched from
the dataframe and passed to the teacher model. The second entry is the target language; its sentence
will be fetched from the dataframe and passed to the student model.
"""
def __init__(
self,
dataframe: List[Dict[str, str]],
teacher_model: SentenceTransformer,
combinations: List[Tuple[str, str]],
batch_size: int = 8,
name="",
write_csv: bool = True,
):
self.combinations = combinations
self.name = name
self.batch_size = batch_size
if name:
name = "_" + name
self.csv_file = "mse_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps"]
self.write_csv = write_csv
self.data = {}
logger.info("Compute teacher embeddings")
all_source_sentences = set()
for src_lang, trg_lang in self.combinations:
src_sentences = []
trg_sentences = []
for row in dataframe:
if row[src_lang].strip() != "" and row[trg_lang].strip() != "":
all_source_sentences.add(row[src_lang])
src_sentences.append(row[src_lang])
trg_sentences.append(row[trg_lang])
self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences)
self.csv_headers.append("{}-{}".format(src_lang, trg_lang))
all_source_sentences = list(all_source_sentences)
all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size)
self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)}
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1):
model.eval()
mse_scores = []
for src_lang, trg_lang in self.combinations:
src_sentences, trg_sentences = self.data[(src_lang, trg_lang)]
src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences])
trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size))
mse = ((src_embeddings - trg_embeddings) ** 2).mean()
mse *= 100
mse_scores.append(mse)
logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang))
logger.info("MSE (*100):\t{:4f}".format(mse))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps] + mse_scores)
return -np.mean(mse_scores) # Return negative score as SentenceTransformers maximizes the performance
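# Usage sketch (editor's illustration, not part of the original module): the "dataframe" is a
# list of dicts keyed by language code. Model names, language codes and sentences below are
# assumptions; teacher and student must share the embedding dimension.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    teacher = SentenceTransformer("all-MiniLM-L6-v2")
    student = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    data = [
        {"en": "Hello world", "de": "Hallo Welt", "es": "Hola mundo"},
        {"en": "Good morning", "de": "Guten Morgen", "es": "Buenos días"},
    ]
    evaluator = MSEEvaluatorFromDataFrame(
        dataframe=data,
        teacher_model=teacher,
        combinations=[("en", "de"), ("en", "es")],
        name="toy-multilingual",
    )
    print(evaluator(student, output_path="."))  # negated mean MSE*100 across combinations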
from . import SentenceEvaluator
import logging
from sentence_transformers.util import paraphrase_mining
import os
import csv
from typing import List, Tuple, Dict
from collections import defaultdict
logger = logging.getLogger(__name__)
class ParaphraseMiningEvaluator(SentenceEvaluator):
"""
Given a large set of sentences, this evaluator performs paraphrase (duplicate) mining and
identifies the pairs with the highest similarity. It compares the extracted paraphrase pairs
with a set of gold labels and computes the F1 score.
"""
def __init__(
self,
sentences_map: Dict[str, str],
duplicates_list: List[Tuple[str, str]] = None,
duplicates_dict: Dict[str, Dict[str, bool]] = None,
add_transitive_closure: bool = False,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
write_csv: bool = True,
):
"""
:param sentences_map: A dictionary that maps sentence-ids to sentences, i.e. sentences_map[id] => sentence.
:param duplicates_list: Duplicates_list is a list with id pairs [(id1, id2), (id1, id5)] that identifies the duplicates / paraphrases in the sentences_map
:param duplicates_dict: A default dictionary mapping [id1][id2] to true if id1 and id2 are duplicates. Must be symmetric, i.e., if [id1][id2] => True, then [id2][id1] => True.
:param add_transitive_closure: If true, it adds a transitive closure, i.e. if dup[a][b] and dup[b][c], then dup[a][c]
:param query_chunk_size: To identify the paraphrases, the cosine-similarity between all sentence-pairs will be computed. As this might require a lot of memory, we perform a batched computation. #query_batch_size sentences will be compared against up to #corpus_batch_size sentences. In the default setting, 5000 sentences will be grouped together and compared up-to against 100k other sentences.
:param corpus_chunk_size: The corpus will be batched, to reduce the memory requirement
:param max_pairs: We will only extract up to #max_pairs potential paraphrase candidates.
:param top_k: For each query, we extract the top_k most similar pairs and add them to a sorted list. I.e., for one sentence we cannot find more than top_k paraphrases
:param show_progress_bar: Output a progress bar
:param batch_size: Batch size for computing sentence embeddings
:param name: Name of the experiment
:param write_csv: Write results to CSV file
"""
self.sentences = []
self.ids = []
for id, sentence in sentences_map.items():
self.sentences.append(sentence)
self.ids.append(id)
self.name = name
self.show_progress_bar = show_progress_bar
self.batch_size = batch_size
self.query_chunk_size = query_chunk_size
self.corpus_chunk_size = corpus_chunk_size
self.max_pairs = max_pairs
self.top_k = top_k
self.duplicates = duplicates_dict if duplicates_dict is not None else defaultdict(lambda: defaultdict(bool))
if duplicates_list is not None:
for id1, id2 in duplicates_list:
if id1 in sentences_map and id2 in sentences_map:
self.duplicates[id1][id2] = True
self.duplicates[id2][id1] = True
# Add transitive closure
if add_transitive_closure:
self.duplicates = self.add_transitive_closure(self.duplicates)
positive_key_pairs = set()
for key1 in self.duplicates:
for key2 in self.duplicates[key1]:
if (
key1 in sentences_map
and key2 in sentences_map
and (self.duplicates[key1][key2] or self.duplicates[key2][key1])
):
positive_key_pairs.add(tuple(sorted([key1, key2])))
self.total_num_duplicates = len(positive_key_pairs)
if name:
name = "_" + name
self.csv_file: str = "paraphrase_mining_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "precision", "recall", "f1", "threshold", "average_precision"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
out_txt = f" after epoch {epoch}:" if steps == -1 else f" in epoch {epoch} after {steps} steps:"
else:
out_txt = ":"
logger.info("Paraphrase Mining Evaluation on " + self.name + " dataset" + out_txt)
# Compute embedding for the sentences
pairs_list = paraphrase_mining(
model,
self.sentences,
self.show_progress_bar,
self.batch_size,
self.query_chunk_size,
self.corpus_chunk_size,
self.max_pairs,
self.top_k,
)
logger.info("Number of candidate pairs: " + str(len(pairs_list)))
# Compute F1 score and Average Precision
n_extract = n_correct = 0
threshold = 0
best_f1 = best_recall = best_precision = 0
average_precision = 0
for idx in range(len(pairs_list)):
score, i, j = pairs_list[idx]
id1 = self.ids[i]
id2 = self.ids[j]
# Compute optimal threshold and F1-score
n_extract += 1
if self.duplicates[id1][id2] or self.duplicates[id2][id1]:
n_correct += 1
precision = n_correct / n_extract
recall = n_correct / self.total_num_duplicates
f1 = 2 * precision * recall / (precision + recall)
average_precision += precision
if f1 > best_f1:
best_f1 = f1
best_precision = precision
best_recall = recall
threshold = (pairs_list[idx][0] + pairs_list[min(idx + 1, len(pairs_list) - 1)][0]) / 2
average_precision = average_precision / self.total_num_duplicates
logger.info("Average Precision: {:.2f}".format(average_precision * 100))
logger.info("Optimal threshold: {:.4f}".format(threshold))
logger.info("Precision: {:.2f}".format(best_precision * 100))
logger.info("Recall: {:.2f}".format(best_recall * 100))
logger.info("F1: {:.2f}\n".format(best_f1 * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, best_precision, best_recall, best_f1, threshold, average_precision])
return average_precision
@staticmethod
def add_transitive_closure(graph):
nodes_visited = set()
for a in list(graph.keys()):
if a not in nodes_visited:
connected_subgraph_nodes = set()
connected_subgraph_nodes.add(a)
# Add all nodes in the connected graph
neighbor_nodes_queue = list(graph[a])
while len(neighbor_nodes_queue) > 0:
node = neighbor_nodes_queue.pop(0)
if node not in connected_subgraph_nodes:
connected_subgraph_nodes.add(node)
neighbor_nodes_queue.extend(graph[node])
# Ensure transitivity between all nodes in the graph
connected_subgraph_nodes = list(connected_subgraph_nodes)
for i in range(len(connected_subgraph_nodes) - 1):
for j in range(i + 1, len(connected_subgraph_nodes)):
graph[connected_subgraph_nodes[i]][connected_subgraph_nodes[j]] = True
graph[connected_subgraph_nodes[j]][connected_subgraph_nodes[i]] = True
nodes_visited.add(connected_subgraph_nodes[i])
nodes_visited.add(connected_subgraph_nodes[j])
return graph
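# Usage sketch (editor's illustration, not part of the original module): sentence ids map to
# sentences, and duplicates_list marks which id pairs are paraphrases. Model name and toy
# data are assumptions.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences_map = {
        "s1": "How can I learn Python quickly?",
        "s2": "What is the fastest way to learn Python?",
        "s3": "What is the capital of France?",
        "s4": "Which city is the capital of France?",
    }
    duplicates_list = [("s1", "s2"), ("s3", "s4")]
    evaluator = ParaphraseMiningEvaluator(sentences_map, duplicates_list=duplicates_list, name="toy")
    # Returns the average precision of the mined pairs against the gold duplicates.
    print(evaluator(model, output_path="."))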
from . import SentenceEvaluator
import logging
import numpy as np
import os
import csv
from ..util import cos_sim
import torch
from sklearn.metrics import average_precision_score, ndcg_score
import tqdm
from typing import Optional
logger = logging.getLogger(__name__)
class RerankingEvaluator(SentenceEvaluator):
"""
This class evaluates a SentenceTransformer model for the task of re-ranking.
Given a query and a list of documents, it computes the score [query, doc_i] for all possible
documents and sorts them in decreasing order. Then, MRR@10, NDCG@10 and MAP are computed to measure the quality of the ranking.
:param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query,
positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents.
"""
def __init__(
self,
samples,
at_k: int = 10,
name: str = "",
write_csv: bool = True,
similarity_fct=cos_sim,
batch_size: int = 64,
show_progress_bar: bool = False,
use_batched_encoding: bool = True,
mrr_at_k: Optional[int] = None,
):
self.samples = samples
self.name = name
if mrr_at_k is not None:
logger.warning(f"The `mrr_at_k` parameter has been deprecated; please use `at_k={mrr_at_k}` instead.")
self.at_k = mrr_at_k
else:
self.at_k = at_k
self.similarity_fct = similarity_fct
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.use_batched_encoding = use_batched_encoding
if isinstance(self.samples, dict):
self.samples = list(self.samples.values())
### Remove sample with empty positive / negative set
self.samples = [
sample for sample in self.samples if len(sample["positive"]) > 0 and len(sample["negative"]) > 0
]
self.csv_file = "RerankingEvaluator" + ("_" + name if name else "") + f"_results_@{self.at_k}.csv"
self.csv_headers = [
"epoch",
"steps",
"MAP",
"MRR@{}".format(self.at_k),
"NDCG@{}".format(self.at_k),
]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
scores = self.compute_metrices(model)
mean_ap = scores["map"]
mean_mrr = scores["mrr"]
mean_ndcg = scores["ndcg"]
#### Some stats about the dataset
num_positives = [len(sample["positive"]) for sample in self.samples]
num_negatives = [len(sample["negative"]) for sample in self.samples]
logger.info(
"Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(
len(self.samples),
np.min(num_positives),
np.mean(num_positives),
np.max(num_positives),
np.min(num_negatives),
np.mean(num_negatives),
np.max(num_negatives),
)
)
logger.info("MAP: {:.2f}".format(mean_ap * 100))
logger.info("MRR@{}: {:.2f}".format(self.at_k, mean_mrr * 100))
logger.info("NDCG@{}: {:.2f}".format(self.at_k, mean_ndcg * 100))
#### Write results to disc
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, mean_ap, mean_mrr, mean_ndcg])
return mean_ap
def compute_metrices(self, model):
return (
self.compute_metrices_batched(model)
if self.use_batched_encoding
else self.compute_metrices_individual(model)
)
def compute_metrices_batched(self, model):
"""
Computes the metrics in a batched way, by encoding all queries and
all documents together
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
all_query_embs = model.encode(
[sample["query"] for sample in self.samples],
convert_to_tensor=True,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
)
all_docs = []
for sample in self.samples:
all_docs.extend(sample["positive"])
all_docs.extend(sample["negative"])
all_docs_embs = model.encode(
all_docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar
)
# Compute scores
query_idx, docs_idx = 0, 0
for instance in self.samples:
query_emb = all_query_embs[query_idx]
query_idx += 1
num_pos = len(instance["positive"])
num_neg = len(instance["negative"])
docs_emb = all_docs_embs[docs_idx : docs_idx + num_pos + num_neg]
docs_idx += num_pos + num_neg
if num_pos == 0 or num_neg == 0:
continue
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
is_relevant = [1] * num_pos + [0] * num_neg
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
def compute_metrices_individual(self, model):
"""
Embeds every (query, positive, negative) tuple individually.
It is slower than the batched version, but saves memory as only the
embeddings for one tuple are needed. Useful when you have
a really large test set.
"""
all_mrr_scores = []
all_ndcg_scores = []
all_ap_scores = []
for instance in tqdm.tqdm(self.samples, disable=not self.show_progress_bar, desc="Samples"):
query = instance["query"]
positive = list(instance["positive"])
negative = list(instance["negative"])
if len(positive) == 0 or len(negative) == 0:
continue
docs = positive + negative
is_relevant = [1] * len(positive) + [0] * len(negative)
query_emb = model.encode(
[query], convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False
)
docs_emb = model.encode(docs, convert_to_tensor=True, batch_size=self.batch_size, show_progress_bar=False)
pred_scores = self.similarity_fct(query_emb, docs_emb)
if len(pred_scores.shape) > 1:
pred_scores = pred_scores[0]
pred_scores_argsort = torch.argsort(-pred_scores) # Sort in decreasing order
pred_scores = pred_scores.cpu().tolist()
# Compute MRR score
mrr_score = 0
for rank, index in enumerate(pred_scores_argsort[0 : self.at_k]):
if is_relevant[index]:
mrr_score = 1 / (rank + 1)
break
all_mrr_scores.append(mrr_score)
# Compute NDCG score
all_ndcg_scores.append(ndcg_score([is_relevant], [pred_scores], k=self.at_k))
# Compute AP
all_ap_scores.append(average_precision_score(is_relevant, pred_scores))
mean_ap = np.mean(all_ap_scores)
mean_mrr = np.mean(all_mrr_scores)
mean_ndcg = np.mean(all_ndcg_scores)
return {"map": mean_ap, "mrr": mean_mrr, "ndcg": mean_ndcg}
class SentenceEvaluator:
"""
Base class for all evaluators
Extend this class and implement __call__ for custom evaluators.
"""
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
"""
This is called during training to evaluate the model.
It returns a score for the evaluation with a higher score indicating a better result.
:param model:
the model to evaluate
:param output_path:
path where predictions and metrics are written to
:param epoch:
the epoch where the evaluation takes place.
This is used for the file prefixes.
If this is -1, then we assume evaluation on test data.
:param steps:
the steps in the current epoch at time of the evaluation.
This is used for the file prefixes.
If this is -1, then we assume evaluation at the end of the epoch.
:return: a score for the evaluation with a higher score indicating a better result
"""
pass
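# Minimal sketch of a custom evaluator built on the interface above: it scores
# a model by the mean cosine similarity of paired paraphrases. The class name
# and its arguments are illustrative placeholders, not part of the library API.
from ..util import pytorch_cos_sim


class ToyParaphraseEvaluator(SentenceEvaluator):
    def __init__(self, sentences1, sentences2):
        assert len(sentences1) == len(sentences2)
        self.sentences1 = sentences1
        self.sentences2 = sentences2

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        emb1 = model.encode(self.sentences1, convert_to_tensor=True)
        emb2 = model.encode(self.sentences2, convert_to_tensor=True)
        # Higher mean pairwise similarity -> better model, as the base class contract requires
        return pytorch_cos_sim(emb1, emb2).diagonal().mean().item()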
from . import SentenceEvaluator
from typing import Iterable
class SequentialEvaluator(SentenceEvaluator):
"""
This evaluator allows multiple sub-evaluators to be passed. When the model is evaluated,
it is run sequentially through all sub-evaluators.
All scores are passed to 'main_score_function', which derives one final score value.
"""
def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function=lambda scores: scores[-1]):
self.evaluators = evaluators
self.main_score_function = main_score_function
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
scores = []
for evaluator in self.evaluators:
scores.append(evaluator(model, output_path, epoch, steps))
return self.main_score_function(scores)
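# Minimal sketch showing how SequentialEvaluator chains sub-evaluators and
# reduces their scores with `main_score_function` (here the mean instead of
# the default "last score"). The constant evaluators are stand-ins for real ones.
if __name__ == "__main__":
    class ConstantEvaluator(SentenceEvaluator):
        def __init__(self, score: float):
            self.score = score

        def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
            return self.score

    seq = SequentialEvaluator(
        [ConstantEvaluator(0.4), ConstantEvaluator(0.8)],
        main_score_function=lambda scores: sum(scores) / len(scores),
    )
    print(seq(model=None))  # ~0.6; the dummy evaluators never touch the model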
from enum import Enum
class SimilarityFunction(Enum):
COSINE = 0
EUCLIDEAN = 1
MANHATTAN = 2
DOT_PRODUCT = 3
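# Hypothetical helper sketch: one way to map the enum members above to scoring
# callables with "higher is better" semantics (similarity for COSINE/DOT_PRODUCT,
# negated distance for EUCLIDEAN/MANHATTAN). Inputs are assumed to be 2-D
# (batch, dim) float tensors; this helper is not part of the library API.
import torch
from ..util import pytorch_cos_sim


def similarity_scores(fct: SimilarityFunction, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    if fct == SimilarityFunction.COSINE:
        return pytorch_cos_sim(a, b)
    if fct == SimilarityFunction.DOT_PRODUCT:
        return a @ b.transpose(0, 1)
    if fct == SimilarityFunction.EUCLIDEAN:
        return -torch.cdist(a, b, p=2)  # negate so that larger values mean "more similar"
    if fct == SimilarityFunction.MANHATTAN:
        return -torch.cdist(a, b, p=1)
    raise ValueError(f"Unknown similarity function: {fct}")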
from . import SentenceEvaluator
import logging
from ..util import pytorch_cos_sim
import os
import csv
import numpy as np
from typing import List
import torch
logger = logging.getLogger(__name__)
class TranslationEvaluator(SentenceEvaluator):
"""
Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3, ...) and (fr_1, fr_2, fr_3, ...),
where fr_i is the translation of en_i, this evaluator checks whether vec(en_i) has the highest similarity
to vec(fr_i) among all target embeddings. The accuracy is computed in both directions.
"""
def __init__(
self,
source_sentences: List[str],
target_sentences: List[str],
show_progress_bar: bool = False,
batch_size: int = 16,
name: str = "",
print_wrong_matches: bool = False,
write_csv: bool = True,
):
"""
Constructs an evaluator for the given sets of parallel sentences.
The sentence at index i in target_sentences is assumed to be the translation of the sentence at index i in source_sentences.
:param source_sentences:
List of sentences in the source language
:param target_sentences:
List of sentences in the target language
:param show_progress_bar:
If True, shows a progress bar while computing embeddings
:param batch_size:
Batch size used to compute the sentence embeddings
:param name:
Name of the evaluator, used for logging and in the CSV file name
:param print_wrong_matches:
Prints incorrect matches
:param write_csv:
Write results to a CSV file
"""
self.source_sentences = source_sentences
self.target_sentences = target_sentences
self.name = name
self.batch_size = batch_size
self.show_progress_bar = show_progress_bar
self.print_wrong_matches = print_wrong_matches
assert len(self.source_sentences) == len(self.target_sentences)
if name:
name = "_" + name
self.csv_file = "translation_evaluation" + name + "_results.csv"
self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"]
self.write_csv = write_csv
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("Evaluating translation matching Accuracy on " + self.name + " dataset" + out_txt)
embeddings1 = torch.stack(
model.encode(
self.source_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=False,
)
)
embeddings2 = torch.stack(
model.encode(
self.target_sentences,
show_progress_bar=self.show_progress_bar,
batch_size=self.batch_size,
convert_to_numpy=False,
)
)
cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy()
correct_src2trg = 0
correct_trg2src = 0
for i in range(len(cos_sims)):
max_idx = np.argmax(cos_sims[i])
if i == max_idx:
correct_src2trg += 1
elif self.print_wrong_matches:
print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT")
print("Src:", self.source_sentences[i])
print("Trg:", self.target_sentences[max_idx])
print("Argmax score:", cos_sims[i][max_idx], "vs. correct score:", cos_sims[i][i])
results = zip(range(len(cos_sims[i])), cos_sims[i])
results = sorted(results, key=lambda x: x[1], reverse=True)
for idx, score in results[0:5]:
print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx])
cos_sims = cos_sims.T
for i in range(len(cos_sims)):
max_idx = np.argmax(cos_sims[i])
if i == max_idx:
correct_trg2src += 1
acc_src2trg = correct_src2trg / len(cos_sims)
acc_trg2src = correct_trg2src / len(cos_sims)
logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg * 100))
logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
output_file_exists = os.path.isfile(csv_path)
with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
writer = csv.writer(f)
if not output_file_exists:
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, acc_src2trg, acc_trg2src])
return (acc_src2trg + acc_trg2src) / 2
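# Minimal usage sketch for the evaluator above on a tiny EN/DE toy set.
# "paraphrase-multilingual-MiniLM-L12-v2" is just an example multilingual
# checkpoint; any multilingual SentenceTransformer can be substituted.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    en = ["Hello world", "The weather is nice today", "I like reading books"]
    de = ["Hallo Welt", "Das Wetter ist heute schön", "Ich lese gerne Bücher"]
    model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    evaluator = TranslationEvaluator(en, de, name="en-de", print_wrong_matches=True)
    score = evaluator(model, output_path=".")  # mean of the src2trg and trg2src accuracies
    print(f"Mean matching accuracy: {score:.3f}")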
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from typing import List
from ..readers import InputExample
logger = logging.getLogger(__name__)
class TripletEvaluator(SentenceEvaluator):
"""
Evaluate a model based on a triplet: (sentence, positive_example, negative_example).
Checks if distance(sentence, positive_example) < distance(sentence, negative_example).
"""
def __init__(
self,
anchors: List[str],
positives: List[str],
negatives: List[str],
main_distance_function: SimilarityFunction = None,
name: str = "",
batch_size: int = 16,
show_progress_bar: bool = False,
write_csv: bool = True,
):
"""
:param anchors: Sentences to check similarity to (e.g., a query)
:param positives: List of positive sentences
:param negatives: List of negative sentences
:param main_distance_function: One of 0 (Cosine), 1 (Euclidean) or 2 (Manhattan). Defaults to None, in which case the best of the three accuracies is returned.
:param name: Name for the output
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If True, prints a progress bar
:param write_csv: Write results to a CSV file
"""
self.anchors = anchors
self.positives = positives
self.negatives = negatives
self.name = name
assert len(self.anchors) == len(self.positives)
assert len(self.anchors) == len(self.negatives)
self.main_distance_function = main_distance_function
self.batch_size = batch_size
if show_progress_bar is None:
show_progress_bar = (
logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
)
self.show_progress_bar = show_progress_bar
self.csv_file: str = "triplet_evaluation" + ("_" + name if name else "") + "_results.csv"
self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhattan", "accuracy_euclidean"]
self.write_csv = write_csv
@classmethod
def from_input_examples(cls, examples: List[InputExample], **kwargs):
anchors = []
positives = []
negatives = []
for example in examples:
anchors.append(example.texts[0])
positives.append(example.texts[1])
negatives.append(example.texts[2])
return cls(anchors, positives, negatives, **kwargs)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
else:
out_txt = ":"
logger.info("TripletEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
num_triplets = 0
num_correct_cos_triplets, num_correct_manhattan_triplets, num_correct_euclidean_triplets = 0, 0, 0
embeddings_anchors = model.encode(
self.anchors, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
embeddings_positives = model.encode(
self.positives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
embeddings_negatives = model.encode(
self.negatives, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
# Cosine distance
pos_cos_distance = paired_cosine_distances(embeddings_anchors, embeddings_positives)
neg_cos_distances = paired_cosine_distances(embeddings_anchors, embeddings_negatives)
# Manhattan
pos_manhattan_distance = paired_manhattan_distances(embeddings_anchors, embeddings_positives)
neg_manhattan_distances = paired_manhattan_distances(embeddings_anchors, embeddings_negatives)
# Euclidean
pos_euclidean_distance = paired_euclidean_distances(embeddings_anchors, embeddings_positives)
neg_euclidean_distances = paired_euclidean_distances(embeddings_anchors, embeddings_negatives)
for idx in range(len(pos_cos_distance)):
num_triplets += 1
if pos_cos_distance[idx] < neg_cos_distances[idx]:
num_correct_cos_triplets += 1
if pos_manhattan_distance[idx] < neg_manhattan_distances[idx]:
num_correct_manhattan_triplets += 1
if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]:
num_correct_euclidean_triplets += 1
accuracy_cos = num_correct_cos_triplets / num_triplets
accuracy_manhattan = num_correct_manhattan_triplets / num_triplets
accuracy_euclidean = num_correct_euclidean_triplets / num_triplets
logger.info("Accuracy Cosine Distance: \t{:.2f}".format(accuracy_cos * 100))
logger.info("Accuracy Manhattan Distance:\t{:.2f}".format(accuracy_manhattan * 100))
logger.info("Accuracy Euclidean Distance:\t{:.2f}\n".format(accuracy_euclidean * 100))
if output_path is not None and self.write_csv:
csv_path = os.path.join(output_path, self.csv_file)
if not os.path.isfile(csv_path):
with open(csv_path, newline="", mode="w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(self.csv_headers)
writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])
else:
with open(csv_path, newline="", mode="a", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([epoch, steps, accuracy_cos, accuracy_manhattan, accuracy_euclidean])
if self.main_distance_function == SimilarityFunction.COSINE:
return accuracy_cos
if self.main_distance_function == SimilarityFunction.MANHATTAN:
return accuracy_manhattan
if self.main_distance_function == SimilarityFunction.EUCLIDEAN:
return accuracy_euclidean
return max(accuracy_cos, accuracy_manhattan, accuracy_euclidean)
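# Minimal usage sketch: build the evaluator from InputExample triplets
# (anchor, positive, negative) via from_input_examples() above. The sentences
# and the "all-MiniLM-L6-v2" checkpoint are placeholders.
if __name__ == "__main__":
    from sentence_transformers import SentenceTransformer

    examples = [
        InputExample(texts=["A man is eating food.", "A man is eating a meal.", "A man rides a horse."]),
        InputExample(texts=["A woman plays violin.", "A woman plays music.", "A dog is barking."]),
    ]
    model = SentenceTransformer("all-MiniLM-L6-v2")
    evaluator = TripletEvaluator.from_input_examples(examples, name="toy-triplets", batch_size=2)
    # With main_distance_function=None, the best of the three accuracies is returned
    print(evaluator(model, output_path="."))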
from .SentenceEvaluator import SentenceEvaluator
from .SimilarityFunction import SimilarityFunction
from .BinaryClassificationEvaluator import BinaryClassificationEvaluator
from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator
from .InformationRetrievalEvaluator import InformationRetrievalEvaluator
from .LabelAccuracyEvaluator import LabelAccuracyEvaluator
from .MSEEvaluator import MSEEvaluator
from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame
from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator
from .SequentialEvaluator import SequentialEvaluator
from .TranslationEvaluator import TranslationEvaluator
from .TripletEvaluator import TripletEvaluator
from .RerankingEvaluator import RerankingEvaluator
__all__ = [
"SentenceEvaluator",
"SimilarityFunction",
"BinaryClassificationEvaluator",
"EmbeddingSimilarityEvaluator",
"InformationRetrievalEvaluator",
"LabelAccuracyEvaluator",
"MSEEvaluator",
"MSEEvaluatorFromDataFrame",
"ParaphraseMiningEvaluator",
"SequentialEvaluator",
"TranslationEvaluator",
"TripletEvaluator",
"RerankingEvaluator",
]
import random
from typing import Any, Dict, Iterable, List, Tuple
import warnings
from torch import Tensor, nn
from torch.nn import functional as F
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses.CachedMultipleNegativesRankingLoss import CachedMultipleNegativesRankingLoss
from sentence_transformers.models import Transformer
class TransformerDecorator:
"""
Decorator that caches the embeddings of all layers of the transformer.
When `layer_idx` is set, it returns the cached embeddings of that layer instead.
This is meant to override the forward function of the Transformer.
"""
def __init__(self, transformer: Transformer, original_forward):
self.transformer = transformer
self.original_forward = original_forward
self.embeddings: List[Tuple[Tensor]] = []
self.last_embeddings: List[Tensor] = []
self.features: List[Dict[str, Tensor]] = []
self.layer_idx = None
self.call_idx = 0
def set_layer_idx(self, layer_idx):
self.layer_idx = layer_idx
self.call_idx = 0
def get_layer_embeddings(self):
return torch.concat([embedding[self.layer_idx] for embedding in self.embeddings], dim=1)
def __call__(self, features):
if self.layer_idx is None:
output = self.call_grow_cache(features)
else:
output = self.call_use_cache(features)
self.call_idx += 1
return output
def call_grow_cache(self, features):
"""
Temporarily sets the output_hidden_states to True, runs the model, and then restores the original setting.
The cached `all_layer_embeddings` output can then be used to retrieve the embeddings of every layer.
"""
original_output_hidden_states = self.transformer.auto_model.config.output_hidden_states
self.transformer.auto_model.config.output_hidden_states = True
output = self.original_forward(features)
# We ignore the first layer, as it is the input embeddings
# and the last layer, as we already computed the loss over it
self.num_layers = len(output["all_layer_embeddings"]) - 1
self.embeddings.append(output["all_layer_embeddings"][1:-1])
self.last_embeddings.append(output["token_embeddings"])
self.features.append(
{key: value for key, value in output.items() if key not in ["all_layer_embeddings", "token_embeddings"]}
)
# Restore original setting
self.transformer.auto_model.config.output_hidden_states = original_output_hidden_states
if original_output_hidden_states:
del output["all_layer_embeddings"]
return output
def call_use_cache(self, features):
return {**self.features[self.call_idx], "token_embeddings": self.embeddings[self.call_idx][self.layer_idx]}
class ForwardDecorator:
"""
Decorator that caches the embeddings after all modules (e.g. pooling) of the model.
Required to get the embeddings after all modules for the KL-divergence loss.
This is meant to override the forward function of the SentenceTransformer.
"""
def __init__(self, fn):
self.fn = fn
self.embeddings = []
def __call__(self, features):
output = self.fn(features)
self.embeddings.append(output["sentence_embedding"])
return output
def get_embeddings(self):
embeddings = torch.concat(self.embeddings, dim=0)
self.embeddings = []
return embeddings
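# Standalone sketch of the pattern used by both decorators above: a module's
# `forward` attribute is temporarily replaced by a callable that records the
# output before returning it, and is restored afterwards. The Linear layer and
# names below are illustrative only.
if __name__ == "__main__":
    import torch
    from torch import nn

    layer = nn.Linear(4, 2)
    original = layer.forward
    cached_outputs = []

    def caching_forward(x):
        out = original(x)
        cached_outputs.append(out.detach())  # record the output before handing it back
        return out

    layer.forward = caching_forward          # swap in the decorator
    layer(torch.randn(3, 4))                 # nn.Module.__call__ picks up the instance attribute
    layer.forward = original                 # restore, as AdaptiveLayerLoss.forward does below
    print(len(cached_outputs), cached_outputs[0].shape)  # -> 1 torch.Size([3, 2])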
class AdaptiveLayerLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
loss: nn.Module,
n_layers_per_step: int = 1,
last_layer_weight: float = 1.0,
prior_layers_weight: float = 1.0,
kl_div_weight: float = 1.0,
kl_temperature: float = 0.3,
) -> None:
"""
The AdaptiveLayerLoss can be seen as a loss *modifier* that allows you to use other loss functions at non-final
layers of the Sentence Transformer model. This is useful when you want to train a model where users have
the option to lower the number of layers used in order to improve inference speed and memory usage.
:param model: SentenceTransformer model
:param loss: The loss function to be used, e.g. :class:`MultipleNegativesRankingLoss`, :class:`CoSENTLoss`, etc.
:param n_layers_per_step: The number of layers to use per step. If -1, then all layers are used. If > 0, then
a random sample of `n_layers_per_step` layers is used per step, separate from the final layer, which is
always used. The 2DMSE paper uses `n_layers_per_step=1`. The default value is 1.
:param last_layer_weight: The weight to use for the loss of the final layer. Increase this to focus more on the
performance when using all layers. The default value is 1.0.
:param prior_layers_weight: The weight to use for the loss of the prior layers. Increase this to focus more on
the performance when using fewer layers. The default value is 1.0.
:param kl_div_weight: The weight to use for the KL-divergence loss that is used to make the prior layers match
that of the last layer. Increase this to focus more on the performance when using fewer layers. The default
value is 1.0.
:param kl_temperature: The temperature to use for the KL-divergence loss. If 0, then the KL-divergence loss is
not used. The default value is 0.3.
References:
- The concept was inspired by the 2DMSE paper: https://arxiv.org/abs/2402.14776
- `Adaptive Layers <../../examples/training/adaptive_layer/README.html>`_
Requirements:
1. The base loss cannot be :class:`CachedMultipleNegativesRankingLoss`.
Relations:
- :class:`Matryoshka2dLoss` uses this loss in combination with :class:`MatryoshkaLoss` which allows for
output dimensionality reduction for faster downstream tasks (e.g. retrieval).
Input:
+---------------------------------------+--------+
| Texts | Labels |
+=======================================+========+
| any | any |
+---------------------------------------+--------+
Example:
::
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
model = SentenceTransformer('microsoft/mpnet-base')
train_examples = [
InputExample(texts=['Anchor 1', 'Positive 1']),
InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
train_loss = losses.AdaptiveLayerLoss(model, train_loss)
model.fit(
[(train_dataloader, train_loss)],
epochs=10,
)
"""
super().__init__()
self.model = model
self.loss = loss
self.n_layers_per_step = n_layers_per_step
self.last_layer_weight = last_layer_weight
self.prior_layers_weight = prior_layers_weight
self.kl_div_weight = kl_div_weight
self.kl_temperature = kl_temperature
assert isinstance(self.model[0], Transformer)
if isinstance(loss, CachedMultipleNegativesRankingLoss):
warnings.warn("MatryoshkaLoss is not compatible with CachedMultipleNegativesRankingLoss.", stacklevel=2)
def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor:
# Decorate the forward function of the transformer to cache the embeddings of all layers
original_transformer_forward = self.model[0].forward
transformer_decorator = TransformerDecorator(self.model[0], original_transformer_forward)
self.model[0].forward = transformer_decorator
# Decorate the forward function of the model to get the embeddings after all modules (e.g. pooling)
original_forward = self.model.forward
forward_decorator = ForwardDecorator(original_forward)
self.model.forward = forward_decorator
# Run the loss normally: i.e. the final layer, but 1) use the transformers decorator to cache
# the embeddings of all layers and 2) use the forward decorator to get the embeddings after all modules
# for the KL-divergence loss
loss = self.loss(sentence_features, labels) * self.last_layer_weight
if self.kl_temperature > 0:
final_embeddings = forward_decorator.get_embeddings()
final_embeddings = F.softmax(final_embeddings / self.kl_temperature, dim=-1)
num_layers = transformer_decorator.num_layers
layer_indices = range(num_layers - 1)
if self.n_layers_per_step > 0 and self.n_layers_per_step < num_layers - 1:
layer_indices = random.sample(layer_indices, self.n_layers_per_step)
# This loop is over `num_layer - 1` layers because we already computed the loss over the final layer
for layer_idx in layer_indices:
# Add regular loss for each layer by using the cached embeddings of that layer
transformer_decorator.set_layer_idx(layer_idx)
layer_loss = self.loss(sentence_features, labels)
loss = loss + layer_loss / (1 + layer_idx) / len(layer_indices) * self.prior_layers_weight
# and KL-divergence loss between the current layer and the final layer
# Note: we use "batchmean" reduction as that aligns with the mathematical definition
if self.kl_temperature > 0:
embeddings = forward_decorator.get_embeddings()
kl_div_loss = F.kl_div(
F.log_softmax(embeddings / self.kl_temperature, dim=-1),
final_embeddings,
reduction="batchmean",
)
loss = loss + kl_div_loss * self.kl_temperature * self.kl_div_weight
self.model[0].forward = original_transformer_forward
self.model.forward = original_forward
return loss
def get_config_dict(self) -> Dict[str, Any]:
return {
"loss": self.loss.__class__.__name__,
"n_layers_per_step": self.n_layers_per_step,
"last_layer_weight": self.last_layer_weight,
"prior_layers_weight": self.prior_layers_weight,
"kl_div_weight": self.kl_div_weight,
"kl_temperature": self.kl_temperature,
}
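# Worked sketch of the per-layer weighting applied in forward() above: each
# sampled prior layer contributes
#     layer_loss / (1 + layer_idx) / len(layer_indices) * prior_layers_weight,
# so shallower layers receive larger weights. The numbers below assume a
# 12-layer transformer with n_layers_per_step=-1 (all prior layers used).
if __name__ == "__main__":
    num_layers = 12                               # layers counted after dropping the input embeddings
    layer_indices = list(range(num_layers - 1))   # the final layer is handled separately
    prior_layers_weight = 1.0
    for idx in layer_indices:
        weight = prior_layers_weight / (1 + idx) / len(layer_indices)
        print(f"layer {idx:2d}: weight {weight:.4f}")  # layer 0 -> ~0.0909, layer 10 -> ~0.0083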